mirror of
https://gitee.com/ledc/IYUUAutoReseed
synced 2025-06-11 19:28:57 +00:00
IYUU自动辅种工具初始化版本
This commit is contained in:
52
vendor/owner888/phpspider/README.md
vendored
Normal file
52
vendor/owner888/phpspider/README.md
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
# phpspider -- PHP蜘蛛爬虫框架
|
||||
《我用爬虫一天时间“偷了”知乎一百万用户,只为证明PHP是世界上最好的语言 》所使用的程序
|
||||
|
||||
phpspider是一个爬虫开发框架。使用本框架,你不用了解爬虫的底层技术实现,也不必自己处理爬虫被网站屏蔽、部分网站需要登录或验证码识别才能爬取等问题。只需简单几行PHP代码,就可以创建自己的爬虫;利用框架封装的多进程Worker类库,代码更简洁,执行效率更高、速度更快。
|
||||
|
||||
demo目录下有一些特定网站的爬取规则,只要你安装了PHP环境,代码就可以在命令行下直接跑。 对爬虫感兴趣的开发者可以加QQ群一起讨论:147824717。
|
||||
|
||||
下面以糗事百科为例, 来看一下我们的爬虫长什么样子:
|
||||
|
||||
```
|
||||
$configs = array(
|
||||
'name' => '糗事百科',
|
||||
'domains' => array(
|
||||
'qiushibaike.com',
|
||||
'www.qiushibaike.com'
|
||||
),
|
||||
'scan_urls' => array(
|
||||
'http://www.qiushibaike.com/'
|
||||
),
|
||||
'content_url_regexes' => array(
|
||||
"http://www.qiushibaike.com/article/\d+"
|
||||
),
|
||||
'list_url_regexes' => array(
|
||||
"http://www.qiushibaike.com/8hr/page/\d+\?s=\d+"
|
||||
),
|
||||
'fields' => array(
|
||||
array(
|
||||
// 抽取内容页的文章内容
|
||||
'name' => "article_content",
|
||||
'selector' => "//*[@id='single-next-link']",
|
||||
'required' => true
|
||||
),
|
||||
array(
|
||||
// 抽取内容页的文章作者
|
||||
'name' => "article_author",
|
||||
'selector' => "//div[contains(@class,'author')]//h2",
|
||||
'required' => true
|
||||
),
|
||||
),
|
||||
);
|
||||
$spider = new phpspider($configs);
|
||||
$spider->start();
|
||||
```
|
||||
爬虫的整体框架就是这样, 首先定义了一个$configs数组, 里面设置了待爬网站的一些信息, 然后通过调用```$spider = new phpspider($configs);```和```$spider->start();```来配置并启动爬虫.
|
||||
|
||||
#### 运行界面如下:
|
||||
|
||||

|
||||
|
||||
更多详细内容,移步到:
|
||||
|
||||
[开发文档](http://doc.phpspider.org)
|
77
vendor/owner888/phpspider/autoloader.php
vendored
Normal file
77
vendor/owner888/phpspider/autoloader.php
vendored
Normal file
@ -0,0 +1,77 @@
|
||||
<?php
|
||||
/**
|
||||
* This file is part of phpspider.
|
||||
*
|
||||
* Licensed under The MIT License
|
||||
* For full copyright and license information, please see the MIT-LICENSE.txt
|
||||
* Redistributions of files must retain the above copyright notice.
|
||||
*
|
||||
* @author seatle<seatle@foxmail.com>
|
||||
* @copyright seatle<seatle@foxmail.com>
|
||||
* @link http://www.phpspider.org/
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
||||
*/
|
||||
namespace phpspider;
|
||||
|
||||
/**
|
||||
* autoloader.
|
||||
*/
|
||||
class autoloader
{
    /**
     * Extra root directory searched for classes outside the phpspider namespace.
     *
     * @var string
     */
    protected static $_autoload_root_path = '';

    /**
     * Remember the root directory used for classes outside the phpspider namespace.
     *
     * @param string $root_path
     * @return void
     */
    public static function set_root_path($root_path)
    {
        self::$_autoload_root_path = $root_path;
    }

    /**
     * Map a fully qualified class name to a file and load it.
     *
     * Names starting with "phpspider\" are resolved relative to this directory.
     * Any other name is looked up under the configured root path first and then
     * under the parent of this directory.
     *
     * @param string $name Fully qualified class name.
     * @return boolean True when the file was loaded and now declares the class.
     */
    public static function load_by_namespace($name)
    {
        $relative = str_replace('\\', DIRECTORY_SEPARATOR, $name);
        $own_prefix = 'phpspider\\';

        if (strncmp($name, $own_prefix, strlen($own_prefix)) === 0)
        {
            // Drop the leading "phpspider" segment; the rest is a path below this directory.
            $candidate = __DIR__ . substr($relative, strlen('phpspider')) . '.php';
        }
        else
        {
            $candidate = '';
            if (self::$_autoload_root_path)
            {
                $candidate = self::$_autoload_root_path . DIRECTORY_SEPARATOR . $relative . '.php';
            }
            if ($candidate === '' || !is_file($candidate))
            {
                // Fall back to the directory that contains the phpspider package.
                $candidate = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . "$relative.php";
            }
        }

        if (!is_file($candidate))
        {
            return false;
        }

        require_once($candidate);

        // Do not trigger autoloading again while checking the outcome.
        return class_exists($name, false);
    }
}
|
||||
|
||||
spl_autoload_register('\phpspider\autoloader::load_by_namespace');
|
38
vendor/owner888/phpspider/composer.json
vendored
Normal file
38
vendor/owner888/phpspider/composer.json
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "owner888/phpspider",
|
||||
"type": "library",
|
||||
"keywords": [
|
||||
"framework",
|
||||
"phpspider"
|
||||
],
|
||||
"homepage": "http://www.phpspider.org",
|
||||
"license": "MIT",
|
||||
"description": "The PHPSpider Framework.",
|
||||
"authors": [
|
||||
{
|
||||
"name": "Seatle Yang",
|
||||
"email": "seatle@foxmail.com",
|
||||
"homepage": "http://www.phpspider.org",
|
||||
"role": "Developer"
|
||||
}
|
||||
],
|
||||
"support": {
|
||||
"email": "seatle@foxmail.com",
|
||||
"issues": "https://github.com/owner888/phpspider/issues",
|
||||
"forum": "http://wenda.phpspider.org/",
|
||||
"wiki": "http://doc.phpspider.org/",
|
||||
"source": "https://github.com/owner888/phpspider"
|
||||
},
|
||||
"require": {
|
||||
"php": ">=5.5.0"
|
||||
},
|
||||
"suggest": {
|
||||
"ext-pcntl": "For better performance.", "ext-redis": "For better performance."
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"phpspider\\": "./"
|
||||
}
|
||||
},
|
||||
"minimum-stability": "dev"
|
||||
}
|
64
vendor/owner888/phpspider/core/cache.php
vendored
Normal file
64
vendor/owner888/phpspider/core/cache.php
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider缓存类文件
|
||||
//----------------------------------
|
||||
|
||||
class cache
{
    // A singleton must not be used here: every process needs its own connection.
    //protected static $_instance;

    /**
     * Create a new, connected Redis client from $GLOBALS['config']['redis'].
     *
     * Reads the keys host, port, timeout, pass and prefix. Every failure is
     * written to the error log and reported to the caller as null.
     *
     * @return Redis|null Connected client, or null when the extension is
     *                    missing, the connection fails or authentication fails.
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-04-10 22:55
     */
    public static function init()
    {
        if (!extension_loaded('Redis'))
        {
            // log is declared in the phpspider\core namespace, so it has to be
            // referenced by its full name from this global-namespace class.
            \phpspider\core\log::add("extension redis is not installed", "Error");
            return null;
        }

        $config = $GLOBALS['config']['redis'];
        $_instance = new Redis();

        // pconnect must not be used here, it fails with:
        // Uncaught exception 'RedisException' with message 'read error on connection'
        try
        {
            $connected = $_instance->connect($config['host'], $config['port'], $config['timeout']);
            $reason = '';
        }
        catch (\Exception $e)
        {
            $connected = false;
            $reason = ': ' . $e->getMessage();
        }
        if (!$connected)
        {
            \phpspider\core\log::add("Redis Server connection failed" . $reason, "Error");
            return null;
        }

        // Authenticate only when a password is configured.
        if (!empty($config['pass']))
        {
            if (!$_instance->auth($config['pass']))
            {
                \phpspider\core\log::add("Redis Server authentication failed!!", "Error");
                return null;
            }
        }

        // No serializer is configured on purpose: without one arrays cannot be
        // stored, and PHP's own format cannot be read by other languages, so
        // callers are expected to JSON-encode values themselves.
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_NONE);     // don't serialize data
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_PHP);      // use built-in serialize/unserialize
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_IGBINARY); // use igBinary serialize/unserialize

        $_instance->setOption(Redis::OPT_PREFIX, $config['prefix'] . ":");

        return $_instance;
    }
}
|
||||
|
||||
|
55
vendor/owner888/phpspider/core/constants.php
vendored
Normal file
55
vendor/owner888/phpspider/core/constants.php
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider公共入口文件
|
||||
//----------------------------------
|
||||
|
||||
//namespace phpspider\core;
|
||||
|
||||
// Display errors.
ini_set('display_errors', 'on');
// Reporting all.
error_reporting(E_ALL);

// Never time out.
ini_set('max_execution_time', '0');
set_time_limit(0);

// Raise the memory limit to 1024M, but never lower a limit that is already
// higher. memory_limit may carry a K/M/G suffix and -1 means "unlimited", so
// the configured value is converted to bytes before it is compared.
call_user_func(function ()
{
    $raw = trim((string) ini_get('memory_limit'));
    if ($raw === '-1')
    {
        return;
    }
    $bytes = (float) intval($raw);
    switch (strtolower(substr($raw, -1)))
    {
        case 'g':
            $bytes *= 1024;
            // no break: a gigabyte is 1024 megabytes
        case 'm':
            $bytes *= 1024;
            // no break: a megabyte is 1024 kilobytes
        case 'k':
            $bytes *= 1024;
    }
    if ($bytes < 1024 * 1024 * 1024)
    {
        ini_set('memory_limit', '1024M');
    }
});

if( PHP_SAPI != 'cli' )
{
    exit("You must run the CLI environment\n");
}

// Date.timezone
if (!ini_get('date.timezone'))
{
    date_default_timezone_set('Asia/Shanghai');
}

// Core library directories. Guarded so that a second definition elsewhere
// does not raise an "already defined" error.
if (!defined('CORE'))
{
    define('CORE', dirname(__FILE__));
}
if (!defined('PATH_ROOT'))
{
    define('PATH_ROOT', CORE."/../");
}
if (!defined('PATH_DATA'))
{
    define('PATH_DATA', CORE."/../data");
}
if (!defined('PATH_LIBRARY'))
{
    define('PATH_LIBRARY', CORE."/../library");
}
|
||||
|
||||
//系统配置
|
||||
//if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
|
||||
//{
|
||||
//require PATH_ROOT."/config/inc_config.php";
|
||||
//}
|
||||
|
||||
|
579
vendor/owner888/phpspider/core/db.php
vendored
Normal file
579
vendor/owner888/phpspider/core/db.php
vendored
Normal file
@ -0,0 +1,579 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider数据库类文件
|
||||
//----------------------------------
|
||||
|
||||
namespace phpspider\core;
|
||||
|
||||
/**
 * Static MySQL helper built on mysqli.
 *
 * Keeps one connection per named link, reconnects after a lost connection and
 * offers small helpers that build INSERT / UPDATE / DELETE statements.
 * Failures are reported by returning false; they are logged through log::add().
 */
class db
{
    /** @var array Connection settings keyed by link name. */
    private static $configs = array();
    /** @var mixed Result of the most recent query() call. */
    private static $rsid;
    /** @var array Per-link state with the keys 'conn', 'fail' and 'pid'. */
    private static $links = array();
    /** @var string Name of the link used by all following calls. */
    private static $link_name = 'default';
    /** @var bool True while autocommit is switched off, i.e. a transaction is open. */
    private static $autocommiting = false;

    /**
     * Make sure the current link has a usable connection.
     *
     * Opens the connection when none exists. When a connection exists but was
     * created by a different process, every link is dropped and reopened,
     * because processes must not share a connection. A failed connect is
     * retried; after five failures on the link the process exits with code 250.
     *
     * @return void
     */
    public static function _init()
    {
        $name = self::$link_name;
        // Read the settings of the current link.
        $config = $name == 'default' ? self::_get_default_config() : self::$configs[$name];

        if (!empty(self::$links[$name]) && !empty(self::$links[$name]['conn']))
        {
            $curr_pid = function_exists('posix_getpid') ? posix_getpid() : 0;
            // The connection belongs to another (parent) process: rebuild it.
            if (self::$links[$name]['pid'] != $curr_pid)
            {
                self::clear_link();
            }
            return;
        }

        // First connection for this link: initialise the failure counter and owner pid.
        if (empty(self::$links[$name]))
        {
            self::$links[$name]['fail'] = 0;
            self::$links[$name]['pid'] = function_exists('posix_getpid') ? posix_getpid() : 0;
        }

        try
        {
            self::$links[$name]['conn'] = mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);
        }
        catch (\Exception $e)
        {
            // mysqli may be configured to throw; treat that like a failed connect.
            self::$links[$name]['conn'] = null;
        }

        if (!empty(self::$links[$name]['conn']) && !mysqli_connect_errno())
        {
            mysqli_query(self::$links[$name]['conn'], " SET character_set_connection=utf8, character_set_results=utf8, character_set_client=binary, sql_mode='' ");
            return;
        }

        self::$links[$name]['conn'] = null;
        self::$links[$name]['fail']++;
        $errmsg = 'Mysql Connect failed['.self::$links[$name]['fail'].']: ' . mysqli_connect_error();
        echo util::colorize(date("H:i:s") . " {$errmsg}\n\n", 'fail');
        log::add($errmsg, "Error");
        // Give up and stop the process after five failed attempts.
        if (self::$links[$name]['fail'] >= 5)
        {
            exit(250);
        }
        self::_init();
    }

    /**
     * Close every open link and reconnect the current one.
     *
     * In a multi-process setup a child must call this once when the parent has
     * already used the database; otherwise both processes fight over the same
     * connection handle ("Error while reading greeting packet").
     * Only the link selected by the current link name is reopened.
     *
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-03-29 00:51
     */
    public static function clear_link()
    {
        foreach (self::$links as $k => $v)
        {
            // A link whose connect failed holds no handle that could be closed.
            if (!empty($v['conn']))
            {
                @mysqli_close($v['conn']);
            }
            unset(self::$links[$k]);
        }
        self::_init();
    }

    /**
     * Switch to the named link (only needed when several databases are used).
     *
     * @param string $link_name Link identifier.
     * @param array  $config    Connection settings in the same format as
     *                          $GLOBALS['config']['db']; only needed the first
     *                          time a link name is used.
     * @return void
     * @throws \Exception When no settings are known for $link_name.
     */
    public static function set_connect($link_name, $config = array())
    {
        // Validate before switching so a failed call leaves the current link untouched.
        if (empty($config) && empty(self::$configs[$link_name]))
        {
            throw new \Exception("You not set a config array for connect!");
        }

        self::$link_name = $link_name;
        if (!empty($config))
        {
            self::$configs[$link_name] = $config;
        }
    }

    /**
     * Switch back to the default link (only needed when several databases are used).
     *
     * @return void
     */
    public static function set_connect_default()
    {
        self::set_connect('default', self::_get_default_config());
    }

    /**
     * Return the default connection settings, loading them from
     * $GLOBALS['config']['db'] on first use. Exits when they are missing.
     *
     * @return array
     */
    protected static function _get_default_config()
    {
        if (empty(self::$configs['default']))
        {
            if (!isset($GLOBALS['config']['db']) || !is_array($GLOBALS['config']['db']))
            {
                exit('db.php _get_default_config()' . '没有mysql配置');
            }
            self::$configs['default'] = $GLOBALS['config']['db'];
        }
        return self::$configs['default'];
    }

    /**
     * Return the given result handle, or the one of the last query when none is given.
     *
     * @param mixed $rsid
     * @return mixed
     */
    protected static function _get_rsid($rsid = '')
    {
        return $rsid == '' ? self::$rsid : $rsid;
    }

    /**
     * Switch autocommit on or off for the current link.
     *
     * The call is always forwarded to the server, so commit() and rollback()
     * really re-enable autocommit after a transaction.
     *
     * @param bool $mode True to enable autocommit, false to disable it.
     * @return bool True on success.
     */
    public static function autocommit($mode = false)
    {
        self::_init();
        $ok = mysqli_autocommit(self::$links[self::$link_name]['conn'], $mode);
        if ($ok)
        {
            self::$autocommiting = !$mode;
        }
        return $ok;
    }

    /**
     * Start a transaction by disabling autocommit.
     *
     * @return bool True on success.
     */
    public static function begin_tran()
    {
        return self::autocommit(false);
    }

    /**
     * Commit the open transaction and re-enable autocommit.
     *
     * @return bool True when both the commit and the autocommit switch succeeded.
     */
    public static function commit()
    {
        $committed = mysqli_commit(self::$links[self::$link_name]['conn']);
        $restored = self::autocommit(true);
        return $committed && $restored;
    }

    /**
     * Roll back the open transaction and re-enable autocommit.
     *
     * @return bool True when both the rollback and the autocommit switch succeeded.
     */
    public static function rollback()
    {
        $rolled_back = mysqli_rollback(self::$links[self::$link_name]['conn']);
        $restored = self::autocommit(true);
        return $rolled_back && $restored;
    }

    /**
     * Run a statement on the current link.
     *
     * The connection is not pinged before every statement. Instead, a statement
     * that fails with error 2006 or 2013 (connection lost) is retried on a
     * fresh connection, at most three attempts in total. Any other failure is
     * logged together with a backtrace.
     *
     * @param string $sql
     * @return mixed Result of mysqli_query(), or false on failure.
     */
    public static function query($sql)
    {
        $sql = trim($sql);
        $name = self::$link_name;
        $max_attempts = 3;
        $error = '';

        for ($attempt = 1; $attempt <= $max_attempts; $attempt++)
        {
            // Connects on first use and reconnects after the handle was dropped below.
            self::_init();
            $conn = self::$links[$name]['conn'];

            try
            {
                self::$rsid = @mysqli_query($conn, $sql);
            }
            catch (\Exception $e)
            {
                // mysqli may be configured to throw; keep the false-on-error contract.
                self::$rsid = false;
            }
            if (self::$rsid !== false)
            {
                return self::$rsid;
            }

            $errno = mysqli_errno($conn);
            $error = mysqli_error($conn);
            if ($errno != 2013 && $errno != 2006)
            {
                break;
            }

            // Connection lost: log it, drop the handle and try again.
            log::add($error, "Error");
            @mysqli_close($conn);
            self::$links[$name]['conn'] = null;
        }

        log::add("Query SQL: ".$sql, "Warning");
        log::add("Error SQL: ".$error, "Warning");

        $backtrace = debug_backtrace();
        // The first frame is this method itself.
        array_shift($backtrace);
        $narr = array('class', 'type', 'function', 'file', 'line');
        $err = "debug_backtrace:\n";
        foreach ($backtrace as $i => $l)
        {
            foreach ($narr as $k)
            {
                if (!isset($l[$k]))
                {
                    $l[$k] = '';
                }
            }
            $err .= "[$i] in function {$l['class']}{$l['type']}{$l['function']} ";
            if ($l['file']) $err .= " in {$l['file']} ";
            if ($l['line']) $err .= " on line {$l['line']} ";
            $err .= "\n";
        }
        log::add($err);

        return false;
    }

    /**
     * Fetch the next row of a result set as an associative array.
     *
     * @param mixed $rsid Result handle; defaults to the result of the last query.
     * @return array|null Next row, or null when there is none or no result set is available.
     */
    public static function fetch($rsid = '')
    {
        $rsid = self::_get_rsid($rsid);
        if (!($rsid instanceof \mysqli_result))
        {
            return null;
        }
        return mysqli_fetch_array($rsid, MYSQLI_ASSOC);
    }

    /**
     * Run a query and return its first row.
     *
     * " limit 1 " is appended when the statement does not contain the word "limit".
     *
     * @param string $sql
     * @return array|null First row, null when there is no row, array() when the query failed.
     */
    public static function get_one($sql)
    {
        if (!preg_match("/limit/i", $sql))
        {
            $sql = preg_replace("/[,;]$/i", '', trim($sql)) . " limit 1 ";
        }
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return array();
        }
        $row = self::fetch($rsid);
        self::free($rsid);
        return $row;
    }

    /**
     * Run a query and return all of its rows.
     *
     * @param string $sql
     * @return array|false List of rows, false when the result is empty, array() when the query failed.
     */
    public static function get_all($sql)
    {
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return array();
        }
        $rows = array();
        while ($row = self::fetch($rsid))
        {
            $rows[] = $row;
        }
        self::free($rsid);
        return empty($rows) ? false : $rows;
    }

    /**
     * Release a result set.
     *
     * @param mixed $rsid
     * @return void
     */
    public static function free($rsid)
    {
        if ($rsid instanceof \mysqli_result)
        {
            mysqli_free_result($rsid);
        }
    }

    /**
     * @return mixed Auto-increment id generated by the last statement on the current link.
     */
    public static function insert_id()
    {
        return mysqli_insert_id(self::$links[self::$link_name]['conn']);
    }

    /**
     * @return mixed Number of rows affected by the last statement on the current link.
     */
    public static function affected_rows()
    {
        return mysqli_affected_rows(self::$links[self::$link_name]['conn']);
    }

    /**
     * Insert one row with "Insert Ignore".
     *
     * Values are passed through stripslashes() and addslashes(), so a literal
     * backslash in the input is dropped.
     *
     * @param string $table
     * @param array  $data       Column name => value.
     * @param bool   $return_sql Return the statement instead of running it.
     * @return mixed The SQL string, the insert id, or false when the query failed.
     */
    public static function insert($table = '', $data = null, $return_sql = false)
    {
        $columns = array();
        $values = array();
        foreach ((array) $data as $k => $v)
        {
            $columns[] = "`$k`";
            $values[] = "\"" . addslashes(stripslashes((string) $v)) . "\"";
        }
        $sql = "Insert Ignore Into `{$table}` (" . implode(",", $columns) . ") Values (" . implode(",", $values) . ")";
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? self::insert_id() : false;
    }

    /**
     * Insert several rows with one "Insert Ignore" statement.
     *
     * Columns that do not exist in the table (or are auto-increment) are
     * skipped. The column list is taken from the first row, so every row is
     * expected to carry the same keys.
     *
     * @param string $table
     * @param array  $set        List of rows, each column name => value.
     * @param bool   $return_sql Return the statement instead of running it.
     * @return mixed The SQL string, the insert id (or the query result when
     *               there is none), or false on invalid input.
     */
    public static function insert_batch($table = '', $set = NULL, $return_sql = FALSE)
    {
        if (empty($table) || empty($set))
        {
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);

        $keys_sql = null;
        $vals_sql = array();
        foreach ($set as $val)
        {
            // Sort by column name so every row lists its values in the same order.
            ksort($val);
            $keys = array();
            $vals = array();
            foreach ($val as $k => $v)
            {
                // Skip columns the table does not have.
                if (!in_array((string) $k, $fields, true))
                {
                    continue;
                }
                $keys[] = "`$k`";
                $vals[] = "\"$v\"";
            }
            // The first row decides the column list.
            if ($keys_sql === null)
            {
                $keys_sql = $keys;
            }
            $vals_sql[] = implode(",", $vals);
        }

        if (empty($keys_sql))
        {
            // None of the given columns exists in the table.
            return false;
        }

        $sql = "Insert Ignore Into `{$table}`(".implode(", ", $keys_sql).") Values (".implode("), (", $vals_sql).")";

        if ($return_sql) return $sql;

        $rt = self::query($sql);
        $insert_id = self::insert_id();
        return empty($insert_id) ? $rt : $insert_id;
    }

    /**
     * Update several rows with one statement built from "Case ... When" expressions.
     *
     * @param string       $table
     * @param array        $set        List of rows, each column name => value; every row must contain $index.
     * @param string       $index      Column that identifies the row to update.
     * @param string|array $where      Extra conditions, combined with "And".
     * @param bool         $return_sql Return the statement instead of running it.
     * @return mixed The SQL string, the number of affected rows (or the query
     *               result when it is zero), or false on invalid input or failure.
     */
    public static function update_batch($table = '', $set = NULL, $index = NULL, $where = NULL, $return_sql = FALSE)
    {
        if (empty($table) || empty($set) || is_null($index))
        {
            // Do not exit here, that would stop the whole program.
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);

        $ids = array();
        $final = array();
        foreach ($set as $val)
        {
            if (!isset($val[$index]))
            {
                // A row without the index column cannot be matched.
                continue;
            }
            ksort($val);
            // De-duplicate by index value; of several equal "When" branches only the first would apply.
            $key = md5($val[$index]);
            $ids[$key] = $val[$index];

            foreach (array_keys($val) as $field)
            {
                if ((string) $field !== (string) $index)
                {
                    $final[$field][$key] = 'When `'.$index.'` = "'.$val[$index].'" Then "'.$val[$field].'"';
                }
            }
        }

        // Normalise the extra conditions to a list.
        if (empty($where))
        {
            $where = array();
        }
        elseif (!is_array($where))
        {
            $where = array($where);
        }
        // Restrict the update to the rows that are named in $set.
        $where[] = $index.' In ("'.implode('","', $ids).'")';
        $where_sql = " Where ".implode(" And ", $where);

        $cases = '';
        foreach ($final as $k => $v)
        {
            // Skip columns the table does not have.
            if (!in_array((string) $k, $fields, true))
            {
                continue;
            }
            $cases .= '`'.$k.'` = Case '."\n";
            foreach ($v as $row)
            {
                $cases .= $row."\n";
            }
            $cases .= 'Else `'.$k.'` End, ';
        }

        if ($cases === '')
        {
            // Nothing to set.
            return false;
        }

        $sql = "Update `".$table."` Set ".substr($cases, 0, -2).$where_sql;

        if ($return_sql) return $sql;

        $rt = self::query($sql);
        if ($rt === false)
        {
            return false;
        }
        $affected_rows = self::affected_rows();
        return empty($affected_rows) ? $rt : $affected_rows;
    }

    /**
     * Update rows.
     *
     * Values are passed through stripslashes() and addslashes(). Empty
     * conditions are dropped; without any condition every row is updated.
     *
     * @param string       $table
     * @param array        $data       Column name => new value.
     * @param string|array $where      Conditions, combined with "And".
     * @param bool         $return_sql Return the statement instead of running it.
     * @return mixed The SQL string, the number of affected rows, or false when the query failed.
     */
    public static function update($table = '', $data = array(), $where = null, $return_sql = false)
    {
        $assignments = array();
        foreach ((array) $data as $k => $v)
        {
            $assignments[] = "`{$k}` = \"" . addslashes(stripslashes((string) $v)) . "\"";
        }

        // Drop empty conditions, otherwise array("") would produce a bare WHERE.
        $conditions = array_filter(is_array($where) ? $where : array($where));
        $where_sql = empty($conditions) ? "" : " Where " . implode(" And ", $conditions);

        $sql = "UPDATE `{$table}` SET " . implode(",", $assignments) . $where_sql;
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? self::affected_rows() : false;
    }

    /**
     * Delete rows. Refuses to run without a condition so a table is never emptied by accident.
     *
     * @param string       $table
     * @param string|array $where      Conditions, combined with "And".
     * @param bool         $return_sql Return the statement instead of running it.
     * @return mixed The SQL string, the number of affected rows, or false.
     */
    public static function delete($table = '', $where = null, $return_sql = false)
    {
        if (empty($where))
        {
            return false;
        }
        $where = 'Where ' . (!is_array($where) ? $where : implode(' And ', $where));
        $sql = "Delete From `{$table}` {$where}";
        if ($return_sql)
        {
            return $sql;
        }
        return self::query($sql) ? self::affected_rows() : false;
    }

    /**
     * Check the current connection and reconnect when it no longer answers.
     *
     * @return void
     */
    public static function ping()
    {
        $name = self::$link_name;
        if (!empty(self::$links[$name]['conn']))
        {
            if (mysqli_ping(self::$links[$name]['conn']))
            {
                return;
            }
            @mysqli_close(self::$links[$name]['conn']);
            self::$links[$name]['conn'] = null;
        }
        self::_init();
    }

    /**
     * Escape a value, or every value of a (nested) array, with addslashes().
     *
     * Existing escaping is removed first so nothing is escaped twice; as a
     * consequence a literal backslash in the input is dropped. Array keys are kept.
     *
     * @param mixed $array
     * @return mixed
     */
    public static function strsafe($array)
    {
        if (is_array($array) !== true)
        {
            return addslashes(stripslashes((string) $array));
        }

        $arrays = array();
        foreach ($array as $key => $val)
        {
            $arrays[$key] = self::strsafe($val);
        }
        return $arrays;
    }

    /**
     * List the column names of a table, leaving out auto-increment columns.
     * Used by insert_batch() and update_batch() to drop unknown columns.
     *
     * @param string $table
     * @return array
     */
    public static function get_fields($table)
    {
        // "SHOW COLUMNS FROM $table" would give the same result.
        $rows = self::get_all("Desc `{$table}`");
        $fields = array();
        if (empty($rows))
        {
            // get_all() yields false or array() when nothing could be read.
            return $fields;
        }
        foreach ($rows as $v)
        {
            if ($v['Extra'] != 'auto_increment')
            {
                $fields[] = $v['Field'];
            }
        }
        return $fields;
    }

    /**
     * Tell whether a table exists in the current database.
     *
     * @param string $table_name
     * @return bool
     */
    public static function table_exists($table_name)
    {
        $sql = "SHOW TABLES LIKE '" . $table_name . "'";
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return false;
        }
        $table = self::fetch($rsid);
        self::free($rsid);
        return !empty($table);
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
101
vendor/owner888/phpspider/core/init.php
vendored
Normal file
101
vendor/owner888/phpspider/core/init.php
vendored
Normal file
@ -0,0 +1,101 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider公共入口文件
|
||||
//----------------------------------
|
||||
|
||||
// Strict development mode: report every error.
error_reporting( E_ALL );

// Set the time zone.
date_default_timezone_set('Asia/Shanghai');

// constants.php switches off the execution time limit, raises the memory
// limit, refuses to run outside the CLI and defines CORE, PATH_ROOT,
// PATH_DATA and PATH_LIBRARY, so none of that is repeated here.
require_once __DIR__ . '/constants.php';

// Core library directories. Only defined when constants.php has not done so,
// because defining a constant twice raises an "already defined" error.
if (!defined('CORE'))
{
    define('CORE', dirname(__FILE__));
}
if (!defined('PATH_ROOT'))
{
    define('PATH_ROOT', CORE."/../");
}
if (!defined('PATH_DATA'))
{
    define('PATH_DATA', CORE."/../data");
}
if (!defined('PATH_LIBRARY'))
{
    define('PATH_LIBRARY', CORE."/../library");
}

// System configuration (optional).
if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
{
    require PATH_ROOT."/config/inc_config.php";
}
require CORE.'/log.php';
require CORE.'/requests.php';
require CORE.'/selector.php';
require CORE.'/util.php';
require CORE.'/db.php';
require CORE.'/cache.php';
require CORE."/worker.php";
require CORE."/phpspider.php";

// Create the data directories at start-up.
util::path_exists(PATH_DATA);
util::path_exists(PATH_DATA."/lock");
util::path_exists(PATH_DATA."/log");
util::path_exists(PATH_DATA."/cache");
util::path_exists(PATH_DATA."/status");
|
||||
|
||||
/**
 * Autoload a class from the library directory below PATH_ROOT.
 *
 * The library directory is put in front of the existing include path once;
 * the include path that was configured before is kept instead of being
 * replaced on every call.
 *
 * @param string $classname
 * @return void
 */
function autoload($classname)
{
    $library = PATH_ROOT.'/library/';
    $current = get_include_path();
    if (!in_array($library, explode(PATH_SEPARATOR, $current), true))
    {
        set_include_path($library . PATH_SEPARATOR . $current);
    }
    spl_autoload($classname); // replaces include/require
}
|
||||
|
||||
spl_autoload_extensions('.php');
|
||||
spl_autoload_register('autoload');
|
||||
|
||||
/**
|
||||
* 自动加载类库处理
|
||||
* @return void
|
||||
*/
|
||||
//function __autoload( $classname )
|
||||
//{
|
||||
//$classname = preg_replace("/[^0-9a-z_]/i", '', $classname);
|
||||
//if( class_exists ( $classname ) ) {
|
||||
//return true;
|
||||
//}
|
||||
//$classfile = $classname.'.php';
|
||||
//try
|
||||
//{
|
||||
//if ( file_exists ( PATH_LIBRARY.'/'.$classfile ) )
|
||||
//{
|
||||
//require PATH_LIBRARY.'/'.$classfile;
|
||||
//}
|
||||
//else
|
||||
//{
|
||||
//throw new Exception ( 'Error: Cannot find the '.$classname );
|
||||
//}
|
||||
//}
|
||||
//catch ( Exception $e )
|
||||
//{
|
||||
//log::error($e->getMessage().'|'.$classname);
|
||||
//exit();
|
||||
//}
|
||||
//}
|
119
vendor/owner888/phpspider/core/log.php
vendored
Normal file
119
vendor/owner888/phpspider/core/log.php
vendored
Normal file
@ -0,0 +1,119 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider日志类文件
|
||||
//----------------------------------
|
||||
|
||||
namespace phpspider\core;
|
||||
// 引入PATH_DATA
|
||||
require_once __DIR__ . '/constants.php';
|
||||
|
||||
class log
{
    /** @var bool Echo every message in addition to writing it to the file. */
    public static $log_show = false;
    /** @var string|false When set, only levels contained in this string are logged. */
    public static $log_type = false;
    /** @var string File that msg() appends to. */
    public static $log_file = "data/phpspider.log";
    /** @var string Text written in front of a message (colour on). */
    public static $out_sta = "";
    /** @var string Text written after a message (colour off). */
    public static $out_end = "";

    /** Log a bare line without timestamp or level tag. */
    public static function note($msg)
    {
        self::_write_level($msg, 'note');
    }

    /** Log an informational message. */
    public static function info($msg)
    {
        self::_write_level($msg, 'info');
    }

    /** Log a warning, shown in yellow outside Windows. */
    public static function warn($msg)
    {
        self::_write_level($msg, 'warn', "\033[33m");
    }

    /** Log a debug message, shown in cyan outside Windows. */
    public static function debug($msg)
    {
        self::_write_level($msg, 'debug', "\033[36m");
    }

    /** Log an error, shown in red outside Windows. */
    public static function error($msg)
    {
        self::_write_level($msg, 'error', "\033[31m");
    }

    /**
     * Reset the colour markers, switch the given colour on where it applies
     * and hand the message to msg().
     *
     * @param string $msg
     * @param string $level
     * @param string $color Escape sequence that starts the colour; '' for none.
     * @return void
     */
    private static function _write_level($msg, $level, $color = '')
    {
        self::$out_sta = self::$out_end = "";
        if ($color !== '' && !util::is_win())
        {
            self::$out_sta = $color;
            self::$out_end = "\033[0m";
        }
        self::msg($msg, $level);
    }

    /**
     * Format a message and append it to the log file.
     *
     * 'note' messages are always written and carry no timestamp. Any other
     * level is dropped when a level filter is set and does not contain it.
     *
     * @param string $msg
     * @param string $log_type
     * @return false|void False when the message was filtered out.
     */
    public static function msg($msg, $log_type)
    {
        $is_note = ($log_type == 'note');
        if (!$is_note && self::$log_type && strpos(self::$log_type, $log_type) === false)
        {
            return false;
        }

        if ($is_note)
        {
            $line = self::$out_sta. $msg . "\n".self::$out_end;
        }
        else
        {
            $line = self::$out_sta.date("Y-m-d H:i:s")." [{$log_type}] " . $msg .self::$out_end. "\n";
        }

        if (self::$log_show)
        {
            echo $line;
        }
        file_put_contents(self::$log_file, $line, FILE_APPEND | LOCK_EX);
    }

    /**
     * Append a message to the error log below PATH_DATA.
     *
     * @param string $msg
     * @param string $log_type Note|Warning|Error; when empty the message is written unchanged.
     * @return void
     */
    public static function add($msg, $log_type = '')
    {
        $line = $msg;
        if ($log_type != '')
        {
            $line = date("Y-m-d H:i:s")." [{$log_type}] " . $msg . "\n";
        }
        if (self::$log_show)
        {
            echo $line;
        }
        file_put_contents(PATH_DATA."/log/error.log", $line, FILE_APPEND | LOCK_EX);
    }
}
|
||||
|
2870
vendor/owner888/phpspider/core/phpspider.bak20170807.php
vendored
Normal file
2870
vendor/owner888/phpspider/core/phpspider.bak20170807.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3598
vendor/owner888/phpspider/core/phpspider.php
vendored
Normal file
3598
vendor/owner888/phpspider/core/phpspider.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1388
vendor/owner888/phpspider/core/queue.php
vendored
Normal file
1388
vendor/owner888/phpspider/core/queue.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
998
vendor/owner888/phpspider/core/requests.php
vendored
Normal file
998
vendor/owner888/phpspider/core/requests.php
vendored
Normal file
@ -0,0 +1,998 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
// +----------------------------------------------------------------------
|
||||
// | GET请求
|
||||
// | requests::get('http://www.test.com');
|
||||
// | SERVER
|
||||
// | $_GET
|
||||
// +----------------------------------------------------------------------
|
||||
// | POST请求
|
||||
// | $data = array('name'=>'request');
|
||||
// | requests::post('http://www.test.com', $data);
|
||||
// | SERVER
|
||||
// | $_POST
|
||||
// +----------------------------------------------------------------------
|
||||
// | POST RESTful请求
|
||||
// | $data = array('name'=>'request');
|
||||
// | $data_string = json_encode($data);
|
||||
// | requests::set_header("Content-Type", "application/json");
|
||||
// | requests::post('http://www.test.com', $data_string);
|
||||
// | SERVER
|
||||
// | file_get_contents('php://input')
|
||||
// +----------------------------------------------------------------------
|
||||
// | POST 文件上传
|
||||
// | $data = array('file1'=>'./data/phpspider.log');
|
||||
// | requests::post('http://www.test.com', null, $data);
|
||||
// | SERVER
|
||||
// | $_FILES
|
||||
// +----------------------------------------------------------------------
|
||||
// | 代理
|
||||
// | requests::set_proxy(array('223.153.69.150:42354'));
|
||||
// | $html = requests::get('https://www.test.com');
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider请求类文件
|
||||
//----------------------------------
|
||||
|
||||
namespace phpspider\core;
|
||||
|
||||
if (!function_exists('curl_file_create'))
{
    /**
     * Fallback used when the runtime has no curl_file_create(): builds the
     * legacy "@path;filename=...;type=..." upload string instead.
     *
     * @param string $filename Path of the file to upload.
     * @param string $mimetype MIME type; the ";type=" part is omitted when empty.
     * @param string $postname Name sent to the server; defaults to basename($filename).
     * @return string
     */
    function curl_file_create($filename, $mimetype = '', $postname = '')
    {
        $name_part = $postname ?: basename($filename);
        $type_part = $mimetype ? ";type=$mimetype" : '';

        return "@$filename;filename=".$name_part.$type_part;
    }
}
|
||||
|
||||
class requests
|
||||
{
|
||||
const VERSION = '2.0.1';
|
||||
|
||||
protected static $ch = null;
|
||||
|
||||
/**** Public variables ****/
|
||||
|
||||
/* user definable vars */
|
||||
|
||||
public static $timeout = 15;
|
||||
public static $encoding = null;
|
||||
public static $input_encoding = null;
|
||||
public static $output_encoding = null;
|
||||
public static $cookies = array(); // array of cookies to pass
|
||||
// $cookies['username'] = "seatle";
|
||||
public static $rawheaders = array(); // array of raw headers to send
|
||||
public static $domain_cookies = array(); // array of cookies for domain to pass
|
||||
public static $hosts = array(); // random host binding for make request faster
|
||||
public static $headers = array(); // headers returned from server sent here
|
||||
public static $useragents = array("requests/2.0.0"); // random agent we masquerade as
|
||||
public static $client_ips = array(); // random ip we masquerade as
|
||||
public static $proxies = array(); // random proxy ip
|
||||
public static $raw = ""; // head + body content returned from server sent here
|
||||
public static $head = ""; // head content
|
||||
public static $content = ""; // The body before encoding
|
||||
public static $text = ""; // The body after encoding
|
||||
public static $info = array(); // curl info
|
||||
public static $history = 302; // http request status before redirect. ex:30x
|
||||
public static $status_code = 0; // http request status
|
||||
public static $error = ""; // error messages sent here
|
||||
|
||||
/**
|
||||
* set timeout
|
||||
* $timeout 为数组时会分别设置connect和read
|
||||
*
|
||||
* @param init or array $timeout
|
||||
* @return
|
||||
*/
|
||||
public static function set_timeout($timeout)
|
||||
{
|
||||
self::$timeout = $timeout;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置代理
|
||||
* 如果代理有多个,请求时会随机使用
|
||||
*
|
||||
* @param mixed $proxies
|
||||
* array (
|
||||
* 'socks5://user1:pass2@host:port',
|
||||
* 'socks5://user2:pass2@host:port'
|
||||
*)
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2016-09-18 10:17
|
||||
*/
|
||||
public static function set_proxy($proxy)
|
||||
{
|
||||
self::$proxies = is_array($proxy) ? $proxy : array($proxy);
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除代理
|
||||
* 因为每个链接信息里面都有代理信息,有的链接需要,有的不需要,所以必须提供一个删除功能
|
||||
*
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2018-07-16 17:59
|
||||
*/
|
||||
public static function del_proxy()
|
||||
{
|
||||
self::$proxies = array();
|
||||
}
|
||||
|
||||
/**
|
||||
* 自定义请求头部
|
||||
* 请求头内容可以用 requests::$rawheaders 来获取
|
||||
* 比如获取Content-Type:requests::$rawheaders['Content-Type']
|
||||
*
|
||||
* @param string $headers
|
||||
* @return void
|
||||
*/
|
||||
public static function set_header($key, $value)
|
||||
{
|
||||
self::$rawheaders[$key] = $value;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置全局COOKIE
|
||||
*
|
||||
* @param string $cookie
|
||||
* @return void
|
||||
*/
|
||||
public static function set_cookie($key, $value, $domain = '')
|
||||
{
|
||||
if (empty($key))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (!empty($domain))
|
||||
{
|
||||
self::$domain_cookies[$domain][$key] = $value;
|
||||
}
|
||||
else
|
||||
{
|
||||
self::$cookies[$key] = $value;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量设置全局cookie
|
||||
*
|
||||
* @param mixed $cookies
|
||||
* @param string $domain
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function set_cookies($cookies, $domain = '')
|
||||
{
|
||||
$cookies_arr = explode(';', $cookies);
|
||||
if (empty($cookies_arr))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach ($cookies_arr as $cookie)
|
||||
{
|
||||
$cookie_arr = explode('=', $cookie, 2);
|
||||
$key = $cookie_arr[0];
|
||||
$value = empty($cookie_arr[1]) ? '' : $cookie_arr[1];
|
||||
|
||||
if (!empty($domain))
|
||||
{
|
||||
self::$domain_cookies[$domain][$key] = $value;
|
||||
}
|
||||
else
|
||||
{
|
||||
self::$cookies[$key] = $value;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取单一Cookie
|
||||
*
|
||||
* @param mixed $name cookie名称
|
||||
* @param string $domain 不传则取全局cookie,就是手动set_cookie的cookie
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function get_cookie($name, $domain = '')
|
||||
{
|
||||
if (!empty($domain) && !isset(self::$domain_cookies[$domain]))
|
||||
{
|
||||
return '';
|
||||
}
|
||||
$cookies = empty($domain) ? self::$cookies : self::$domain_cookies[$domain];
|
||||
return isset($cookies[$name]) ? $cookies[$name] : '';
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取Cookie数组
|
||||
*
|
||||
* @param string $domain 不传则取全局cookie,就是手动set_cookie的cookie
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function get_cookies($domain = '')
|
||||
{
|
||||
if (!empty($domain) && !isset(self::$domain_cookies[$domain]))
|
||||
{
|
||||
return array();
|
||||
}
|
||||
return empty($domain) ? self::$cookies : self::$domain_cookies[$domain];
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除Cookie
|
||||
*
|
||||
* @param string $domain 不传则删除全局Cookie
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function del_cookie($key, $domain = '')
|
||||
{
|
||||
if (empty($key))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!empty($domain) && !isset(self::$domain_cookies[$domain]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!empty($domain))
|
||||
{
|
||||
if (isset(self::$domain_cookies[$domain][$key]))
|
||||
{
|
||||
unset(self::$domain_cookies[$domain][$key]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (isset(self::$cookies[$key]))
|
||||
{
|
||||
unset(self::$cookies[$key]);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除Cookie
|
||||
*
|
||||
* @param string $domain 不传则删除全局Cookie
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function del_cookies($domain = '')
|
||||
{
|
||||
if (!empty($domain) && !isset(self::$domain_cookies[$domain]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if ( empty($domain) )
|
||||
{
|
||||
self::$cookies = array();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (isset(self::$domain_cookies[$domain]))
|
||||
{
|
||||
unset(self::$domain_cookies[$domain]);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置随机的user_agent
|
||||
*
|
||||
* @param string $useragent
|
||||
* @return void
|
||||
*/
|
||||
public static function set_useragent($useragent)
|
||||
{
|
||||
self::$useragents = is_array($useragent) ? $useragent : array($useragent);
|
||||
}
|
||||
|
||||
/**
|
||||
* set referer
|
||||
*
|
||||
*/
|
||||
public static function set_referer($referer)
|
||||
{
|
||||
self::$rawheaders['Referer'] = $referer;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置伪造IP
|
||||
* 传入数组则为随机IP
|
||||
* @param string $ip
|
||||
* @return void
|
||||
*/
|
||||
public static function set_client_ip($ip)
|
||||
{
|
||||
self::$client_ips = is_array($ip) ? $ip : array($ip);
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除伪造IP
|
||||
*
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2018-07-16 17:59
|
||||
*/
|
||||
public static function del_client_ip()
|
||||
{
|
||||
self::$client_ips = array();
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置中文请求
|
||||
*
|
||||
* @param string $lang
|
||||
* @return void
|
||||
*/
|
||||
public static function set_accept_language($lang = 'zh-CN')
|
||||
{
|
||||
self::$rawheaders['Accept-Language'] = $lang;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置Hosts
|
||||
* 负载均衡到不同的服务器,如果对方使用CDN,采用这个是最好的了
|
||||
*
|
||||
* @param string $hosts
|
||||
* @return void
|
||||
*/
|
||||
public static function set_hosts($host, $ips = array())
|
||||
{
|
||||
$ips = is_array($ips) ? $ips : array($ips);
|
||||
self::$hosts[$host] = $ips;
|
||||
}
|
||||
|
||||
/**
|
||||
* 分割返回的header和body
|
||||
* header用来判断编码和获取Cookie
|
||||
* body用来判断编码,得到编码前和编码后的内容
|
||||
*
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function split_header_body()
|
||||
{
|
||||
$head = $body = '';
|
||||
$head = substr(self::$raw, 0, self::$info['header_size']);
|
||||
$body = substr(self::$raw, self::$info['header_size']);
|
||||
// http header
|
||||
self::$head = $head;
|
||||
// The body before encoding
|
||||
self::$content = $body;
|
||||
|
||||
//$http_headers = array();
|
||||
//// 解析HTTP数据流
|
||||
//if (!empty(self::$raw))
|
||||
//{
|
||||
//self::get_response_cookies($domain);
|
||||
//// body里面可能有 \r\n\r\n,但是第一个一定是HTTP Header,去掉后剩下的就是body
|
||||
//$array = explode("\r\n\r\n", self::$raw);
|
||||
//foreach ($array as $k=>$v)
|
||||
//{
|
||||
//// post 方法会有两个http header:HTTP/1.1 100 Continue、HTTP/1.1 200 OK
|
||||
//if (preg_match("#^HTTP/.*? 100 Continue#", $v))
|
||||
//{
|
||||
//unset($array[$k]);
|
||||
//continue;
|
||||
//}
|
||||
//if (preg_match("#^HTTP/.*? \d+ #", $v))
|
||||
//{
|
||||
//$header = $v;
|
||||
//unset($array[$k]);
|
||||
//$http_headers = self::get_response_headers($v);
|
||||
//}
|
||||
//}
|
||||
//$body = implode("\r\n\r\n", $array);
|
||||
//}
|
||||
|
||||
// 设置了输出编码的转码,注意: xpath只支持utf-8,iso-8859-1 不要转,他本身就是utf-8
|
||||
$body = self::encoding($body); //自动转码
|
||||
// 转码后
|
||||
self::$encoding = self::$output_encoding;
|
||||
|
||||
// The body after encoding
|
||||
self::$text = $body;
|
||||
return array($head, $body);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获得域名相对应的Cookie
|
||||
*
|
||||
* @param mixed $header
|
||||
* @param mixed $domain
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function get_response_cookies($header, $domain)
|
||||
{
|
||||
// 解析Cookie并存入 self::$cookies 方便调用
|
||||
preg_match_all("/.*?Set\-Cookie: ([^\r\n]*)/i", $header, $matches);
|
||||
$cookies = empty($matches[1]) ? array() : $matches[1];
|
||||
|
||||
// 解析到Cookie
|
||||
if (!empty($cookies))
|
||||
{
|
||||
$cookies = implode(';', $cookies);
|
||||
$cookies = explode(';', $cookies);
|
||||
foreach ($cookies as $cookie)
|
||||
{
|
||||
$cookie_arr = explode('=', $cookie, 2);
|
||||
// 过滤 httponly、secure
|
||||
if (count($cookie_arr) < 2)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
$cookie_name = !empty($cookie_arr[0]) ? trim($cookie_arr[0]) : '';
|
||||
if (empty($cookie_name))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// 过滤掉domain路径
|
||||
if (in_array(strtolower($cookie_name), array('path', 'domain', 'expires', 'max-age')))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
self::$domain_cookies[$domain][trim($cookie_arr[0])] = trim($cookie_arr[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获得response header
|
||||
* 此方法占时没有用到
|
||||
*
|
||||
* @param mixed $header
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function get_response_headers($header)
|
||||
{
|
||||
$headers = array();
|
||||
$header_lines = explode("\n", $header);
|
||||
if (!empty($header_lines))
|
||||
{
|
||||
foreach ($header_lines as $line)
|
||||
{
|
||||
$header_arr = explode(':', $line, 2);
|
||||
$key = empty($header_arr[0]) ? '' : trim($header_arr[0]);
|
||||
$val = empty($header_arr[1]) ? '' : trim($header_arr[1]);
|
||||
if (empty($key) || empty($val))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
$headers[$key] = $val;
|
||||
}
|
||||
}
|
||||
self::$headers = $headers;
|
||||
return self::$headers;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取编码
|
||||
* @param $string
|
||||
* @return string
|
||||
*/
|
||||
public static function get_encoding($string)
|
||||
{
|
||||
$encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1'));
|
||||
return strtolower($encoding);
|
||||
}
|
||||
|
||||
/**
|
||||
* 移除页面head区域代码
|
||||
* @param $html
|
||||
* @return mixed
|
||||
*/
|
||||
private static function _remove_head($html)
|
||||
{
|
||||
return preg_replace('/<head.+?>.+<\/head>/is', '<head></head>', $html);
|
||||
}
|
||||
|
||||
/**
|
||||
* 简单的判断一下参数是否为一个URL链接
|
||||
* @param string $str
|
||||
* @return boolean
|
||||
*/
|
||||
private static function _is_url($url)
|
||||
{
|
||||
//$pattern = '/^http(s)?:\\/\\/.+/';
|
||||
$pattern = "/\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/)))/";
|
||||
if (preg_match($pattern, $url))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化 CURL
|
||||
*
|
||||
*/
|
||||
public static function init()
|
||||
{
|
||||
if (!is_resource ( self::$ch ))
|
||||
{
|
||||
self::$ch = curl_init ();
|
||||
curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
|
||||
curl_setopt( self::$ch, CURLOPT_HEADER, false );
|
||||
curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION );
|
||||
// 如果设置了两个时间,就分开设置
|
||||
if (is_array(self::$timeout))
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] );
|
||||
curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, ceil(self::$timeout / 2));
|
||||
curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout);
|
||||
}
|
||||
curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed
|
||||
// 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生
|
||||
curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
|
||||
}
|
||||
return self::$ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* get 请求
|
||||
*/
|
||||
public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
/**
|
||||
* post 请求
|
||||
* $fields 有三种类型:1、数组;2、http query;3、json
|
||||
* 1、array('name'=>'yangzetao')
|
||||
* 2、http_build_query(array('name'=>'yangzetao'))
|
||||
* 3、json_encode(array('name'=>'yangzetao'))
|
||||
* 前两种是普通的post,可以用$_POST方式获取
|
||||
* 第三种是post stream( json rpc,其实就是webservice )
|
||||
* 虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取
|
||||
*
|
||||
* @param mixed $url
|
||||
* @param array $fields
|
||||
* @param mixed $proxies
|
||||
* @static
|
||||
* @access public
|
||||
* @return void
|
||||
*/
|
||||
public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'POST', $fields, $files, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'PUT', $fields, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'DELETE', $fields, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
// 响应HTTP头域里的元信息
|
||||
// 此方法被用来获取请求实体的元信息而不需要传输实体主体(entity-body)
|
||||
// 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。.
|
||||
public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
self::request($url, 'HEAD', $fields, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
self::init ();
|
||||
return self::request($url, 'PATCH', $fields, $allow_redirects, $cert);
|
||||
}
|
||||
|
||||
/**
|
||||
* request
|
||||
*
|
||||
* @param mixed $url 请求URL
|
||||
* @param string $method 请求方法
|
||||
* @param array $fields 表单字段
|
||||
* @param array $files 上传文件
|
||||
* @param mixed $cert CA证书
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
|
||||
{
|
||||
$method = strtoupper($method);
|
||||
if(!self::_is_url($url))
|
||||
{
|
||||
self::$error = "You have requested URL ({$url}) is not a valid HTTP address";
|
||||
return false;
|
||||
}
|
||||
|
||||
// 如果是 get 方式,直接拼凑一个 url 出来
|
||||
if ($method == 'GET' && !empty($fields))
|
||||
{
|
||||
$url = $url.(strpos($url, '?') === false ? '?' : '&').http_build_query($fields);
|
||||
}
|
||||
|
||||
$parse_url = parse_url($url);
|
||||
if (empty($parse_url) || empty($parse_url['host']) || !in_array($parse_url['scheme'], array('http', 'https')))
|
||||
{
|
||||
self::$error = "No connection adapters were found for '{$url}'";
|
||||
return false;
|
||||
}
|
||||
$scheme = $parse_url['scheme'];
|
||||
$domain = $parse_url['host'];
|
||||
|
||||
// 随机绑定 hosts,做负载均衡
|
||||
if (self::$hosts)
|
||||
{
|
||||
if (isset(self::$hosts[$domain]))
|
||||
{
|
||||
$hosts = self::$hosts[$domain];
|
||||
$key = rand(0, count($hosts)-1);
|
||||
$ip = $hosts[$key];
|
||||
$url = str_replace($domain, $ip, $url);
|
||||
self::$rawheaders['Host'] = $domain;
|
||||
}
|
||||
}
|
||||
|
||||
curl_setopt( self::$ch, CURLOPT_URL, $url );
|
||||
|
||||
if ($method != 'GET')
|
||||
{
|
||||
// 如果是 post 方式
|
||||
if ($method == 'POST')
|
||||
{
|
||||
//curl_setopt( self::$ch, CURLOPT_POST, true );
|
||||
$tmpheaders = array_change_key_case(self::$rawheaders, CASE_LOWER);
|
||||
// 有些RESTful服务只接受JSON形态的数据
|
||||
// CURLOPT_POST会把上傳的文件类型设为 multipart/form-data
|
||||
// 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码
|
||||
// CURLOPT_CUSTOMREQUEST可以按指定内容上传
|
||||
if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' )
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method );
|
||||
}
|
||||
else
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_POST, true );
|
||||
}
|
||||
|
||||
$file_fields = array();
|
||||
if (!empty($files))
|
||||
{
|
||||
foreach ($files as $postname => $file)
|
||||
{
|
||||
$filepath = realpath($file);
|
||||
// 如果文件不存在
|
||||
if (!file_exists($filepath))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
$filename = basename($filepath);
|
||||
$type = self::get_mimetype($filepath);
|
||||
$file_fields[$postname] = curl_file_create($filepath, $type, $filename);
|
||||
// curl -F "name=seatle&file=@/absolute/path/to/image.png" htt://localhost/uploadfile.php
|
||||
//$cfile = '@'.realpath($filename).";type=".$type.";filename=".$filename;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
self::$rawheaders['X-HTTP-Method-Override'] = $method;
|
||||
curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method );
|
||||
}
|
||||
|
||||
if ( $method == 'POST' )
|
||||
{
|
||||
// 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包
|
||||
if ( empty($file_fields) )
|
||||
{
|
||||
// post方式
|
||||
if ( is_array($fields) )
|
||||
{
|
||||
$fields = http_build_query($fields);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// 有post数据
|
||||
if ( is_array($fields) && !empty($fields) )
|
||||
{
|
||||
// 某些server可能会有问题
|
||||
$fields = array_merge($fields, $file_fields);
|
||||
}
|
||||
else
|
||||
{
|
||||
$fields = $file_fields;
|
||||
}
|
||||
}
|
||||
|
||||
// 不能直接传数组,不知道是什么Bug,会非常慢
|
||||
curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
|
||||
}
|
||||
}
|
||||
|
||||
$cookies = self::get_cookies();
|
||||
$domain_cookies = self::get_cookies($domain);
|
||||
$cookies = array_merge($cookies, $domain_cookies);
|
||||
// 是否设置了cookie
|
||||
if (!empty($cookies))
|
||||
{
|
||||
foreach ($cookies as $key=>$value)
|
||||
{
|
||||
$cookie_arr[] = $key.'='.$value;
|
||||
}
|
||||
$cookies = implode('; ', $cookie_arr);
|
||||
curl_setopt(self::$ch, CURLOPT_COOKIE, $cookies);
|
||||
}
|
||||
|
||||
if (!empty(self::$useragents))
|
||||
{
|
||||
$key = rand(0, count(self::$useragents) - 1);
|
||||
self::$rawheaders['User-Agent'] = self::$useragents[$key];
|
||||
}
|
||||
|
||||
if (!empty(self::$client_ips))
|
||||
{
|
||||
$key = rand(0, count(self::$client_ips) - 1);
|
||||
self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key];
|
||||
self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key];
|
||||
}
|
||||
|
||||
if (self::$rawheaders)
|
||||
{
|
||||
$http_headers = array();
|
||||
foreach (self::$rawheaders as $k=>$v)
|
||||
{
|
||||
$http_headers[] = $k.': '.$v;
|
||||
}
|
||||
curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers );
|
||||
}
|
||||
|
||||
curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
|
||||
|
||||
// 关闭验证
|
||||
if ($scheme == 'https')
|
||||
{
|
||||
curl_setopt(self::$ch, CURLOPT_SSL_VERIFYPEER, false);
|
||||
curl_setopt(self::$ch, CURLOPT_SSL_VERIFYHOST, false);
|
||||
}
|
||||
|
||||
if (self::$proxies)
|
||||
{
|
||||
$key = rand(0, count(self::$proxies) - 1);
|
||||
$proxy = self::$proxies[$key];
|
||||
curl_setopt( self::$ch, CURLOPT_PROXY, $proxy );
|
||||
}
|
||||
|
||||
// header + body,header 里面有 cookie
|
||||
curl_setopt( self::$ch, CURLOPT_HEADER, true );
|
||||
// 请求跳转后的内容
|
||||
if ($allow_redirects)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true);
|
||||
}
|
||||
|
||||
self::$raw = curl_exec ( self::$ch );
|
||||
// 真实url
|
||||
//$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL);
|
||||
self::$info = curl_getinfo( self::$ch );
|
||||
//print_r(self::$info);
|
||||
self::$status_code = self::$info['http_code'];
|
||||
if (self::$raw === false)
|
||||
{
|
||||
self::$error = 'Curl error: ' . curl_error( self::$ch );
|
||||
//trigger_error(self::$error, E_USER_WARNING);
|
||||
}
|
||||
|
||||
// 关闭句柄
|
||||
curl_close( self::$ch );
|
||||
|
||||
// 请求成功之后才把URL存起来
|
||||
list($header, $text) = self::split_header_body();
|
||||
self::$history = self::get_history($header);
|
||||
self::$headers = self::get_response_headers($header);
|
||||
self::get_response_cookies($header, $domain);
|
||||
//$data = substr($data, 10);
|
||||
//$data = gzinflate($data);
|
||||
return $text;
|
||||
}
|
||||
|
||||
public static function get_history($header)
|
||||
{
|
||||
$status_code = 0;
|
||||
$lines = explode("\n", $header);
|
||||
foreach ($lines as $line)
|
||||
{
|
||||
$line = trim($line);
|
||||
if (preg_match("#^HTTP/.*? (\d+) Found#", $line, $out))
|
||||
{
|
||||
$status_code = empty($out[1]) ? 0 : intval($out[1]);
|
||||
}
|
||||
}
|
||||
return $status_code;
|
||||
}
|
||||
|
||||
// 获取 mimetype
|
||||
public static function get_mimetype($filepath)
|
||||
{
|
||||
$fp = finfo_open(FILEINFO_MIME);
|
||||
$mime = finfo_file($fp, $filepath);
|
||||
finfo_close($fp);
|
||||
$arr = explode(';', $mime);
|
||||
$type = empty($arr[0]) ? '' : $arr[0];
|
||||
return $type;
|
||||
}
|
||||
|
||||
/**
|
||||
* 拼凑文件和表单
|
||||
* 占时没有用到
|
||||
*
|
||||
* @param mixed $post_fields
|
||||
* @param mixed $file_fields
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2017-08-03 18:06
|
||||
*/
|
||||
public static function get_postfile_form($post_fields, $file_fields)
|
||||
{
|
||||
// 构造post数据
|
||||
$data = '';
|
||||
$delimiter = '-------------' . uniqid();
|
||||
// 表单数据
|
||||
foreach ($post_fields as $name => $content)
|
||||
{
|
||||
$data .= '--'.$delimiter."\r\n";
|
||||
$data .= 'Content-Disposition: form-data; name = "'.$name.'"';
|
||||
$data .= "\r\n\r\n";
|
||||
$data .= $content;
|
||||
$data .= "\r\n";
|
||||
}
|
||||
|
||||
foreach ($file_fields as $input_name => $file)
|
||||
{
|
||||
$data .= '--'.$delimiter."\r\n";
|
||||
$data .= 'Content-Disposition: form-data; name = "'.$input_name.'";'.
|
||||
' filename="'.$file['filename'].'"'."\r\n";
|
||||
$data .= "Content-Type: {$file['type']}\r\n";
|
||||
$data .= "\r\n";
|
||||
$data .= $file['content'];
|
||||
$data .= "\r\n";
|
||||
}
|
||||
|
||||
// 结束符
|
||||
$data .= '--'.$delimiter."--\r\n";
|
||||
|
||||
//return array(
|
||||
//CURLOPT_HTTPHEADER => array(
|
||||
//'Content-Type:multipart/form-data;boundary=' . $delimiter,
|
||||
//'Content-Length:' . strlen($data)
|
||||
//),
|
||||
//CURLOPT_POST => true,
|
||||
//CURLOPT_POSTFIELDS => $data,
|
||||
//);
|
||||
return array($delimiter, $data);
|
||||
}
|
||||
|
||||
/**
|
||||
* html encoding transform
|
||||
*
|
||||
* @param string $html
|
||||
* @param string $in
|
||||
* @param string $out
|
||||
* @param string $content
|
||||
* @param string $mode
|
||||
* auto|iconv|mb_convert_encoding
|
||||
* @return string
|
||||
*/
|
||||
public static function encoding($html, $in = null, $out = null, $mode = 'auto')
|
||||
{
|
||||
$valid = array(
|
||||
'auto',
|
||||
'iconv',
|
||||
'mb_convert_encoding',
|
||||
);
|
||||
if (isset(self::$output_encoding))
|
||||
{
|
||||
$out = self::$output_encoding;
|
||||
}
|
||||
if ( ! isset($out))
|
||||
{
|
||||
$out = 'UTF-8';
|
||||
}
|
||||
if ( ! in_array($mode, $valid))
|
||||
{
|
||||
throw new Exception('invalid mode, mode='.$mode);
|
||||
}
|
||||
$if = function_exists('mb_convert_encoding');
|
||||
$if = $if && ($mode == 'auto' || $mode == 'mb_convert_encoding');
|
||||
if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv'))
|
||||
{
|
||||
$func = 'iconv';
|
||||
}
|
||||
elseif ($if)
|
||||
{
|
||||
$func = 'mb_convert_encoding';
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new Exception('charsetTrans failed, no function');
|
||||
}
|
||||
|
||||
$pattern = '/(<meta[^>]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is';
|
||||
if ( ! isset($in))
|
||||
{
|
||||
$n = preg_match($pattern, $html, $in);
|
||||
if ($n > 0)
|
||||
{
|
||||
$in = $in[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
$in = null;
|
||||
}
|
||||
if (empty($in) and function_exists('mb_detect_encoding'))
|
||||
{
|
||||
$in = mb_detect_encoding($html, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1'));
|
||||
}
|
||||
}
|
||||
|
||||
if (isset($in))
|
||||
{
|
||||
if ($in == 'ISO-8859-1')
|
||||
{
|
||||
$in = 'UTF-8';
|
||||
}
|
||||
$old = error_reporting(error_reporting() & ~E_NOTICE);
|
||||
$html = call_user_func($func, $in, $out.'//IGNORE', $html);
|
||||
error_reporting($old);
|
||||
$html = preg_replace($pattern, "\\1$out\\4", $html, 1);
|
||||
}
|
||||
return $html;
|
||||
}
|
||||
}
|
588
vendor/owner888/phpspider/core/selector.php
vendored
Normal file
588
vendor/owner888/phpspider/core/selector.php
vendored
Normal file
@ -0,0 +1,588 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider选择器类文件
|
||||
//----------------------------------
|
||||
|
||||
namespace phpspider\core;
|
||||
|
||||
use phpspider\library\phpquery;
|
||||
use DOMDocument;
|
||||
use DOMXpath;
|
||||
use Exception;
|
||||
|
||||
class selector
|
||||
{
|
||||
/**
|
||||
* 版本号
|
||||
* @var string
|
||||
*/
|
||||
const VERSION = '1.0.2';
|
||||
public static $dom = null;
|
||||
public static $dom_auth = '';
|
||||
public static $xpath = null;
|
||||
public static $error = null;
|
||||
|
||||
/**
 * Extract content from an HTML string using the requested selector engine.
 *
 * @param mixed  $html          HTML source to search
 * @param mixed  $selector      XPath expression, regular expression or CSS selector
 * @param string $selector_type 'xpath' (default), 'regex' or 'css'; case-insensitive
 * @return mixed false when $html or $selector is empty, null for an unknown
 *               selector type, otherwise whatever the chosen engine returns
 */
public static function select($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }

    switch (strtolower($selector_type))
    {
        case 'xpath':
            return self::_xpath_select($html, $selector);
        case 'regex':
            return self::_regex_select($html, $selector);
        case 'css':
            return self::_css_select($html, $selector);
        default:
            // Unknown selector types yield null, as before.
            return null;
    }
}
|
||||
|
||||
/**
 * Remove every fragment matched by the selector from an HTML string.
 *
 * The selector engines are called in "remove" mode and the fragments they
 * return are deleted from $html with a plain string replacement.
 *
 * @param mixed  $html          HTML source
 * @param mixed  $selector      XPath expression, regular expression or CSS selector
 * @param string $selector_type 'xpath' (default), 'regex' or 'css'; case-insensitive
 * @return mixed false when $html or $selector is empty, otherwise the HTML
 *               with the matched fragments removed
 */
public static function remove($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }

    $remove_html = "";
    switch (strtolower($selector_type))
    {
        case 'xpath':
            $remove_html = self::_xpath_select($html, $selector, true);
            break;
        case 'regex':
            $remove_html = self::_regex_select($html, $selector, true);
            break;
        case 'css':
            $remove_html = self::_css_select($html, $selector, true);
            break;
    }

    // The engines return null when nothing matched (or the selector is
    // invalid). Passing null to str_replace() is deprecated since PHP 8.1,
    // so return the input untouched instead.
    if ($remove_html === null || $remove_html === "" || $remove_html === array())
    {
        return $html;
    }

    return str_replace($remove_html, "", $html);
}
|
||||
|
||||
/**
 * XPath selector.
 *
 * The parsed document is cached in static properties and only rebuilt when
 * the md5 of $html differs from the previously loaded content.
 *
 * @param mixed $html     HTML source
 * @param mixed $selector XPath expression
 * @param bool  $remove   when true, return the full serialized markup of each match
 * @return mixed null on an invalid expression (self::$error is set) or when
 *               nothing matched; a string for a single match; an array otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _xpath_select($html, $selector, $remove = false)
{
    if (!is_object(self::$dom))
    {
        self::$dom = new DOMDocument();
    }

    // Reload the document only when the HTML differs from the cached one.
    $fingerprint = md5($html);
    if (self::$dom_auth != $fingerprint)
    {
        self::$dom_auth = $fingerprint;
        @self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
        self::$xpath = new DOMXpath(self::$dom);
    }

    $nodes = @self::$xpath->query($selector);
    if ($nodes === false)
    {
        self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
        // Return null rather than false so callers can distinguish "no value".
        return null;
    }

    $matches = array();
    if (!is_null($nodes))
    {
        foreach ($nodes as $node)
        {
            // Remove mode: take the whole serialized node.
            if ($remove)
            {
                $matches[] = self::$dom->saveXml($node);
                continue;
            }

            $tag  = $node->nodeName;
            $type = $node->nodeType; // 1.Element 2.Attribute 3.Text

            if ($type == 1 && $tag == 'img')
            {
                // For <img> elements return the src attribute directly.
                $matches[] = $node->getAttribute('src');
            }
            elseif ($type == 2 || $type == 3 || $type == 4)
            {
                // Attribute / text / CDATA nodes: return the node value.
                $matches[] = $node->nodeValue;
            }
            else
            {
                // Keep inner markup so children can be extracted in a second
                // pass; strip only the element's own opening and closing tag.
                $markup = self::$dom->saveXml($node);
                $matches[] = preg_replace(array("#^<{$tag}.*>#isU","#</{$tag}>$#isU"), array('', ''), $markup);
            }
        }
    }

    if (empty($matches))
    {
        return null;
    }

    // A single match is returned as a string, several as an array.
    return count($matches) > 1 ? $matches : $matches[0];
}
|
||||
|
||||
/**
 * CSS selector.
 *
 * Translates the CSS selector to XPath and delegates to the XPath engine.
 *
 * @param mixed $html     HTML source
 * @param mixed $selector CSS selector
 * @param bool  $remove   forwarded to the XPath engine
 * @return mixed same as the XPath engine
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _css_select($html, $selector, $remove = false)
{
    $xpath_selector = self::css_to_xpath($selector);
    return self::_xpath_select($html, $xpath_selector, $remove);
}
|
||||
|
||||
/**
 * Regular-expression selector.
 *
 * Runs preg_match_all() over $html and returns:
 *  - the full matches when $remove is true or the pattern has no capture group;
 *  - the captures of the single group when the pattern has exactly one;
 *  - one entry per group (string if matched once, array otherwise) when the
 *    pattern has several groups.
 *
 * @param mixed $html     text to search
 * @param mixed $selector complete PCRE pattern including delimiters
 * @param bool  $remove   when true, return whole matches so they can be deleted
 * @return mixed null on an invalid pattern (self::$error is set) or no match;
 *               a string for a single result; an array otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _regex_select($html, $selector, $remove = false)
{
    // "@" hides the warning preg emits for an invalid, user-supplied pattern.
    if (@preg_match_all($selector, $html, $out) === false)
    {
        self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
        return null;
    }

    // Nothing matched. Checked first so the group arrays below are never empty.
    if (empty($out[0]))
    {
        return null;
    }

    $group_count = count($out) - 1;
    $result = array();
    if ($remove || $group_count == 0)
    {
        // Removal always works on the whole matched text; a pattern without
        // capture groups has nothing else to offer either.
        $result = $out[0];
    }
    elseif ($group_count == 1)
    {
        $result = $out[1];
    }
    else
    {
        for ($i = 1; $i <= $group_count; $i++)
        {
            // Collapse groups that matched exactly once to a plain string.
            $result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0];
        }
    }

    return count($result) > 1 ? $result : $result[0];
}
|
||||
|
||||
/**
 * Placeholder; not implemented. Always returns null.
 *
 * @param mixed $html
 * @param mixed $selector
 * @return null
 */
public static function find_all($html, $selector)
{
    return null;
}
|
||||
|
||||
|
||||
/**
 * Translate a CSS selector into an XPath expression.
 *
 * Supported: tag names, "*", #id, .class (including compound ".a.b"),
 * [attr], [attr=value], [attr^=value], [attr*=value], [attr$=value], the
 * child combinator ">" and the descendant combinator " ".
 * Sibling combinators ("~", "+") and pseudo classes (":...") are tokenised
 * but contribute nothing to the resulting expression.
 *
 * @param string $selectors CSS selector
 * @return string XPath expression
 */
public static function css_to_xpath($selectors)
{
    $queries = self::parse_selector($selectors);
    // True when the previous token was a combinator, i.e. the next predicate
    // needs a "*" node test in front of it.
    $delimiter_before = false;
    $xquery = '';
    foreach ($queries as $s)
    {
        $needs_wildcard = $delimiter_before;
        $delimiter_before = false;

        // TAG
        $is_tag = preg_match('@^[\w|\||-]+$@', $s) || $s == '*';
        if ($is_tag)
        {
            $xquery .= $s;
        }
        // ID
        else if ($s[0] == '#')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // IDs are matched exactly.
            $xquery .= "[@id='".substr($s, 1)."']";
        }
        // CLASSES
        else if ($s[0] == '.')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // The tokenizer keeps ".a.b" as one token: emit one substring
            // predicate per class so every listed class is required.
            $classes = array_filter(explode('.', substr($s, 1)), 'strlen');
            if (empty($classes))
            {
                $classes = array('');
            }
            foreach ($classes as $class)
            {
                $xquery .= "[contains(@class,'".$class."')]";
            }
        }
        // ATTRIBUTES
        else if ($s[0] == '[')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // strip side brackets
            $attr = trim($s, '][');
            // attr with specified value
            if (mb_strpos($s, '='))
            {
                // Limit 2 keeps "=" characters that belong to the value.
                list($attr, $value) = explode('=', $attr, 2);
                $value = trim($value, "'\"");
                if ($attr !== '' && self::is_regexp($attr))
                {
                    // Split off the operator character (^, * or $).
                    $operator = substr($attr, -1);
                    $attr = substr($attr, 0, -1);
                    if ($operator == '^')
                    {
                        $xquery .= "[starts-with(@{$attr},'{$value}')]";
                    }
                    else if ($operator == '*')
                    {
                        $xquery .= "[contains(@{$attr},'{$value}')]";
                    }
                    else
                    {
                        // XPath 1.0 has no ends-with(); compare the tail instead.
                        $xquery .= "[substring(@{$attr}, string-length(@{$attr}) - string-length('{$value}') + 1) = '{$value}']";
                    }
                }
                else
                {
                    $xquery .= "[@{$attr}='{$value}']";
                }
            }
            // attr without specified value
            else
            {
                $xquery .= "[@{$attr}]";
            }
        }
        // ~ general sibling, + adjacent sibling, : pseudo classes — not translated
        else if ($s[0] == '~' || $s[0] == '+' || $s[0] == ':')
        {
        }
        // DIRECT DESCENDANTS
        else if ($s == '>')
        {
            $xquery .= '/';
            $delimiter_before = true;
        }
        // ALL DESCENDANTS
        else if ($s == ' ')
        {
            $xquery .= '//';
            $delimiter_before = true;
        }
        // ERRORS
        else
        {
            exit("Unrecognized token '$s'");
        }
    }
    return $xquery;
}
|
||||
|
||||
/**
 * Split a CSS selector into the flat token list consumed by css_to_xpath().
 *
 * Tokens are tag names, "#id", ".class", "[attr...]", ":pseudo(...)",
 * "~x" / "+x" sibling tokens and the combinators ">" and " ". Commas are
 * skipped. A leading " " (descendant) is prepended unless the list already
 * starts with ">", and "*" is prepended when the first token is a pseudo class.
 *
 * @access private
 * @param string $query CSS selector
 * @return array list of tokens; empty for an empty selector
 */
public static function parse_selector($query)
{
    // Drop whitespace around the combinators > + ~, then collapse the rest.
    $tightened = preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query);
    $query = trim(preg_replace('@\s+@', ' ', $tightened));
    $queries = array();
    if (!$query)
    {
        return $queries;
    }

    $special_chars = array('>', ' ');
    $class_chars   = array('.', '-');
    $pseudo_chars  = array('-');
    $tag_chars     = array('*', '|', '-');

    // Split into characters with mb_substr so multibyte characters stay whole.
    // http://code.google.com/p/phpquery/issues/detail?id=76
    $length = mb_strlen($query);
    $chars = array();
    for ($n = 0; $n < $length; $n++)
    {
        $chars[] = mb_substr($query, $n, 1);
    }

    $pos = 0;
    while ($pos < $length)
    {
        $c = $chars[$pos];
        $token = '';

        // TAG
        if (self::is_char($c) || in_array($c, $tag_chars))
        {
            while (isset($chars[$pos]) && (self::is_char($chars[$pos]) || in_array($chars[$pos], $tag_chars)))
            {
                $token .= $chars[$pos];
                $pos++;
            }
            $queries[] = $token;
        }
        // IDs
        elseif ($c == '#')
        {
            $pos++;
            while (isset($chars[$pos]) && (self::is_char($chars[$pos]) || $chars[$pos] == '-'))
            {
                $token .= $chars[$pos];
                $pos++;
            }
            $queries[] = '#'.$token;
        }
        // SPECIAL CHARS (combinators)
        elseif (in_array($c, $special_chars))
        {
            $queries[] = $c;
            $pos++;
        }
        // COMMA: skipped together with the spaces that follow it
        elseif ($c == ',')
        {
            $pos++;
            while (isset($chars[$pos]) && $chars[$pos] == ' ')
            {
                $pos++;
            }
        }
        // CLASSES: consecutive ".a.b" stay in one token
        elseif ($c == '.')
        {
            while (isset($chars[$pos]) && (self::is_char($chars[$pos]) || in_array($chars[$pos], $class_chars)))
            {
                $token .= $chars[$pos];
                $pos++;
            }
            $queries[] = $token;
        }
        // ~ general sibling / + adjacent sibling
        elseif ($c == '~' || $c == '+')
        {
            // Spaces are accepted only directly after the combinator.
            $space_allowed = true;
            $token .= $chars[$pos++];
            while (isset($chars[$pos])
                && (self::is_char($chars[$pos])
                    || in_array($chars[$pos], $class_chars)
                    || $chars[$pos] == '*'
                    || ($chars[$pos] == ' ' && $space_allowed)))
            {
                if ($chars[$pos] != ' ')
                {
                    $space_allowed = false;
                }
                $token .= $chars[$pos];
                $pos++;
            }
            $queries[] = $token;
        }
        // ATTRS: consume up to the matching "]"
        elseif ($c == '[')
        {
            $depth = 1;
            $token .= $c;
            while (isset($chars[++$pos]))
            {
                $token .= $chars[$pos];
                if ($chars[$pos] == '[')
                {
                    $depth++;
                }
                elseif ($chars[$pos] == ']')
                {
                    $depth--;
                    if (!$depth)
                    {
                        break;
                    }
                }
            }
            $queries[] = $token;
            $pos++;
        }
        // PSEUDO CLASSES
        elseif ($c == ':')
        {
            $token .= $chars[$pos++];
            while (isset($chars[$pos]) && (self::is_char($chars[$pos]) || in_array($chars[$pos], $pseudo_chars)))
            {
                $token .= $chars[$pos];
                $pos++;
            }
            // with arguments? consume up to the matching ")"
            if (isset($chars[$pos]) && $chars[$pos] == '(')
            {
                $token .= $chars[$pos];
                $depth = 1;
                while (isset($chars[++$pos]))
                {
                    $token .= $chars[$pos];
                    if ($chars[$pos] == '(')
                    {
                        $depth++;
                    }
                    elseif ($chars[$pos] == ')')
                    {
                        $depth--;
                        if (!$depth)
                        {
                            break;
                        }
                    }
                }
                $queries[] = $token;
                $pos++;
            }
            else
            {
                $queries[] = $token;
            }
        }
        // Anything else is skipped.
        else
        {
            $pos++;
        }
    }

    if (isset($queries[0]))
    {
        if (isset($queries[0][0]) && $queries[0][0] == ':')
        {
            array_unshift($queries, '*');
        }
        if ($queries[0] != '>')
        {
            array_unshift($queries, ' ');
        }
    }

    return $queries;
}
|
||||
|
||||
/**
 * Test whether the given string contains a word character (\w).
 *
 * @param string $char
 * @return int|false result of preg_match(): 1 on match, 0 otherwise
 */
public static function is_char($char)
{
    $matched = preg_match('@\w@', $char);
    return $matched;
}
|
||||
|
||||
/**
 * Fuzzy-match detection: tells whether an attribute name ends with one of
 * the operator characters
 *   ^  prefix match
 *   *  substring match
 *   $  suffix match
 *
 * @access private
 * @param string $pattern attribute name as written left of "="
 * @return bool
 */
protected static function is_regexp($pattern)
{
    // substr() works on bytes and tolerates an empty string, unlike indexing
    // with a character count from mb_strlen(), which points at the wrong byte
    // for multibyte names and at offset -1 for an empty one.
    $last = substr((string) $pattern, -1);
    return in_array($last, array('^', '*', '$'), true);
}
|
||||
}
|
936
vendor/owner888/phpspider/core/util.php
vendored
Normal file
936
vendor/owner888/phpspider/core/util.php
vendored
Normal file
@ -0,0 +1,936 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// PHPSpider实用函数集合类文件
|
||||
//----------------------------------
|
||||
|
||||
namespace phpspider\core;
|
||||
// 引入PATH_DATA
|
||||
require_once __DIR__ . '/constants.php';
|
||||
|
||||
class util
|
||||
{
|
||||
/**
 * File lock.
 *
 * Returns true while a lock file younger than $lock_timeout seconds exists.
 * Otherwise any stale lock file is deleted, a new one containing the current
 * timestamp is written, and false is returned.
 *
 *   if (!util::lock('statistics_offer'))
 *   {
 *       ...
 *       util::unlock('statistics_offer');
 *   }
 *   else
 *   {
 *       echo "process has been locked\n";
 *   }
 *
 * @param mixed $lock_name    lock name; the file is PATH_DATA/lock/<name>.lock
 * @param int   $lock_timeout seconds after which an existing lock is considered stale
 * @return bool true when already locked, false when the lock was just taken
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-02-18 14:28
 */
public static function lock($lock_name, $lock_timeout = 600)
{
    $lock_file = PATH_DATA."/lock/{$lock_name}.lock";
    $locked_at = util::get_file($lock_file);
    if ($locked_at)
    {
        // Still inside the timeout window: treat the owner as alive.
        if (time() - $locked_at < $lock_timeout)
        {
            return true;
        }
        unlink($lock_file);
    }
    util::put_file($lock_file, time());
    return false;
}
|
||||
|
||||
/**
 * Release a lock by deleting its lock file.
 *
 * @param mixed $lock_name lock name; the file is PATH_DATA/lock/<name>.lock
 * @return void
 */
public static function unlock($lock_name)
{
    $lock_file = PATH_DATA."/lock/{$lock_name}.lock";
    unlink($lock_file);
}
|
||||
|
||||
/**
 * Format a duration in seconds as a human-readable string.
 *
 * Whole years are split off first and are not part of the output; the
 * reported days are what remains after removing them.
 *
 * @param mixed $time   duration in seconds
 * @param bool  $is_log true for the compact "0d 0h 0m 0s" form, false for
 *                      "0 days 0 hours 0 minutes"
 * @return string|false formatted duration, or false when $time is not numeric
 */
public static function time2second($time, $is_log = true)
{
    if (!is_numeric($time))
    {
        return false;
    }

    // Unit lengths in seconds, largest first.
    $spans = array(
        "years"   => 31556926,
        "days"    => 86400,
        "hours"   => 3600,
        "minutes" => 60,
    );
    $value = array(
        "years" => 0, "days" => 0, "hours" => 0,
        "minutes" => 0, "seconds" => 0,
    );
    foreach ($spans as $unit => $span)
    {
        if ($time >= $span)
        {
            $value[$unit] = floor($time / $span);
            $time = ($time % $span);
        }
    }
    $value["seconds"] = floor($time);

    if ($is_log)
    {
        return $value["days"] ."d ". $value["hours"] ."h ". $value["minutes"] ."m ".$value["seconds"]."s";
    }
    return $value["days"] ." days ". $value["hours"] ." hours ". $value["minutes"] ." minutes";
}
|
||||
|
||||
/**
 * List the dates (Y-m-d) between two days.
 *
 * @param string      $day_sta start date, parsed by strtotime()
 * @param string|bool $day_end end date; true means today
 * @param int         $range   step in seconds between entries
 * @return array list of 'Y-m-d' strings
 */
public static function get_days($day_sta, $day_end = true, $range = 86400)
{
    if ($day_end === true)
    {
        $day_end = date('Y-m-d');
    }

    $days = array();
    foreach (range(strtotime($day_sta), strtotime($day_end), $range) as $timestamp)
    {
        $days[] = date('Y-m-d', $timestamp);
    }
    return $days;
}
|
||||
|
||||
/**
 * Count the lines of a file.
 *
 * Reads with stream_get_line() in chunks of at most 8192 bytes, so a line
 * longer than that is counted once per chunk.
 *
 * @param mixed $filepath path of the file
 * @return int number of lines, 0 when the file cannot be opened
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-03-31 21:54
 */
public static function get_file_line($filepath)
{
    $fp = fopen($filepath, 'r');
    if (!$fp)
    {
        return 0;
    }

    $line = 0;
    // Compare against false explicitly: an empty line ("") or a line
    // containing just "0" is falsy and would otherwise end the count early.
    while (stream_get_line($fp, 8192, "\n") !== false)
    {
        $line++;
    }
    fclose($fp);
    return $line;
}
|
||||
|
||||
/**
 * Compute the shard number for a value.
 *
 * The last two hex digits of sha1(strtolower($item_value)) are taken modulo
 * $table_num and left-padded with zeros to 2 digits (3 when $table_num > 100).
 *
 * @param mixed $item_value unique key to shard on
 * @param int   $table_num  number of tables
 * @return string zero-padded shard number
 * @author seatle <seatle@foxmail.com>
 * @created time :2015-10-22 23:25
 */
public static function get_table_num($item_value, $table_num = 100)
{
    // sha1 yields 40 hex characters; only the last two are used.
    $digest = sha1(strtolower($item_value));
    $bucket = base_convert(substr($digest, -2), 16, 10) % $table_num;
    $width = $table_num > 100 ? 3 : 2;
    return str_pad($bucket, $width, "0", STR_PAD_LEFT);
}
|
||||
|
||||
/**
 * Build the sharded table name for a value.
 *
 * The last two hex digits of sha1(strtolower($item_value)) are taken modulo
 * $table_num, left-padded with zeros to 2 digits (3 when $table_num > 100)
 * and appended to $table_name after an underscore.
 *
 * @param mixed $table_name base table name
 * @param mixed $item_value unique key to shard on
 * @param int   $table_num  number of tables
 * @return string "<table_name>_<shard>"
 * @author seatle <seatle@foxmail.com>
 * @created time :2015-10-22 23:25
 */
public static function get_table_name($table_name, $item_value, $table_num = 100)
{
    // sha1 yields 40 hex characters; only the last two are used.
    $digest = sha1(strtolower($item_value));
    $bucket = base_convert(substr($digest, -2), 16, 10) % $table_num;
    $width = $table_num > 100 ? 3 : 2;
    $suffix = str_pad($bucket, $width, "0", STR_PAD_LEFT);
    return $table_name."_".$suffix;
}
|
||||
|
||||
/**
 * Current memory usage, formatted by format_bytes().
 *
 * @return string
 */
public static function memory_get_usage()
{
    return self::format_bytes(memory_get_usage());
}
|
||||
|
||||
/**
 * Peak memory usage, formatted by format_bytes().
 *
 * @return string
 */
public static function memory_get_peak_usage()
{
    return self::format_bytes(memory_get_peak_usage());
}
|
||||
|
||||
/**
 * Convert a byte count to a human-readable size such as "1.5 kb".
 *
 * Units are powers of 1024 from "b" to "pb"; the number is rounded to two
 * decimals. Values below 1024 (including 0 and negative values) are reported
 * in bytes, and values beyond the largest unit stay in "pb".
 *
 * @param int|float $size size in bytes
 * @return string
 */
public static function format_bytes($size)
{
    $unit = array('b', 'kb', 'mb', 'gb', 'tb', 'pb');
    $size = (float) $size;

    // Repeated division instead of log(): log(0) is -INF, which made the
    // original divide by zero, and log() rounding can misplace exact powers.
    $i = 0;
    $last = count($unit) - 1;
    while ($size >= 1024 && $i < $last)
    {
        $size /= 1024;
        $i++;
    }
    return round($size, 2) . ' ' . $unit[$i];
}
|
||||
|
||||
/**
 * Estimate the size of an array.
 *
 * The estimate is the byte length of the array's print_r() dump with the
 * indentation after each newline removed, formatted by format_bytes().
 *
 * @param mixed $arr array
 * @return string
 */
public static function array_size($arr)
{
    $dump = print_r($arr, true);
    $compact = preg_replace("/\n +/", "", $dump);
    return self::format_bytes(strlen($compact));
}
|
||||
|
||||
/**
 * Random string of decimal digits.
 *
 * @param int $num number of digits
 * @return string
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function rand_num($num = 7)
{
    $digits = array();
    for ($n = 0; $n < $num; $n++)
    {
        $digits[] = mt_rand(0, 9);
    }
    return implode("", $digits);
}
|
||||
|
||||
/**
 * Random string of lowercase letters and digits.
 *
 * @param int $num length of the string
 * @return string
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function rand_str($num = 10)
{
    $chars = 'abcdefghijklmnopqrstuvwxyz0123456789';
    // rand() is inclusive on both ends, so the highest valid index is
    // strlen - 1; using strlen itself occasionally picked nothing and
    // produced a string shorter than requested.
    $max_index = strlen($chars) - 1;
    $string = "";
    for ($i = 0; $i < $num; $i ++)
    {
        $string .= $chars[rand(0, $max_index)];
    }
    return $string;
}
|
||||
|
||||
/**
 * Convert Chinese characters to pinyin.
 *
 * The input is converted from UTF-8 to GBK and each double-byte character is
 * looked up in the global $pinyins table, which is loaded on first use from
 * PATH_DATA/pinyin.dat. ASCII letters and digits are copied through; every
 * other character is dropped.
 *
 * @param mixed $str     UTF-8 text
 * @param int   $ishead  0 for the full pinyin, otherwise only the first letter of each syllable
 * @param int   $isclose 0 to drop this function's reference to the table afterwards
 * @static
 * @access public
 * @return string
 */
public static function pinyin($str, $ishead = 0, $isclose = 1)
{
    global $pinyins;

    $str = mb_convert_encoding($str, "gbk", "utf-8");
    $restr = '';
    $str = trim($str);
    $slen = strlen($str);
    if ($slen < 2)
    {
        return $str;
    }

    // empty() also covers the never-initialised global (null), on which
    // count() throws a TypeError in PHP 8.
    if (empty($pinyins))
    {
        $pinyins = array();
        $fp = fopen(PATH_DATA . '/pinyin.dat', 'r');
        // Without this check a failed fopen() made the feof() loop spin forever.
        if ($fp)
        {
            while (($line = fgets($fp)) !== false)
            {
                $line = trim($line);
                // The first two bytes are the key and the text from offset 3 on
                // is the value; shorter lines carry nothing.
                if (strlen($line) < 3)
                {
                    continue;
                }
                $pinyins[$line[0] . $line[1]] = substr($line, 3);
            }
            fclose($fp);
        }
    }

    for ($i = 0; $i < $slen; $i ++)
    {
        if (ord($str[$i]) > 0x80)
        {
            // Lead byte of a double-byte character; stop if the trail byte is missing.
            if ($i + 1 >= $slen)
            {
                break;
            }
            $c = $str[$i] . $str[$i + 1];
            $i ++;
            if (isset($pinyins[$c]))
            {
                if ($ishead == 0)
                {
                    $restr .= $pinyins[$c];
                }
                else
                {
                    $restr .= $pinyins[$c][0];
                }
            }
        }
        else if (preg_match("/[a-z0-9]/i", $str[$i]))
        {
            $restr .= $str[$i];
        }
    }

    if ($isclose == 0)
    {
        // NOTE(review): this only removes the local alias; the global table
        // itself stays loaded. Kept as-is to preserve behaviour.
        unset($pinyins);
    }
    return $restr;
}
|
||||
|
||||
/**
 * Return the leading index letter for a string.
 *
 * ASCII letters and digits are returned unchanged. Otherwise the string is
 * converted from UTF-8 to GBK and the code of its first double-byte character
 * is mapped to the initial (A-Z) of the range it falls in.
 *
 * @param mixed $s0 UTF-8 text
 * @return string|int the first character or its initial, 0 when none applies
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function letter_first($s0)
{
    $s0 = (string) $s0;
    if ($s0 === '')
    {
        return 0;
    }

    // Square brackets: the {} string-offset syntax is a parse error in PHP 8.
    $first = $s0[0];
    $firstchar_ord = ord(strtoupper($first));
    // 65-90 is A-Z (91 would be "["), 48-57 is 0-9.
    if (($firstchar_ord >= 65 and $firstchar_ord <= 90) or ($firstchar_ord >= 48 and $firstchar_ord <= 57))
    {
        return $first;
    }

    $s = mb_convert_encoding($s0, "gbk", "utf-8");
    if (strlen($s) < 2)
    {
        return 0;
    }
    $asc = ord($s[0]) * 256 + ord($s[1]) - 65536;
    if ($asc < -20319)
    {
        return 0;
    }

    // Inclusive upper bound of each initial's range; the ranges are contiguous
    // and start at -20319.
    $upper_bounds = array(
        'A' => -20284, 'B' => -19776, 'C' => -19219, 'D' => -18711,
        'E' => -18527, 'F' => -18240, 'G' => -17923, 'H' => -17418,
        'J' => -16475, 'K' => -16213, 'L' => -15641, 'M' => -15166,
        'N' => -14923, 'O' => -14915, 'P' => -14631, 'Q' => -14150,
        'R' => -14091, 'S' => -13319, 'T' => -12839, 'W' => -12557,
        'X' => -11848, 'Y' => -11056, 'Z' => -10247,
    );
    foreach ($upper_bounds as $letter => $upper)
    {
        if ($asc <= $upper)
        {
            return $letter;
        }
    }
    return 0; // null
}
|
||||
|
||||
/**
 * Timestamp of 23:59:59 on the day $day days before today.
 *
 * @param mixed $day number of days to go back
 * @return int
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function getxtime($day)
{
    $days_back = intval($day);
    // mktime() normalises day numbers that are zero or negative.
    $target_day = date("d") - $days_back;
    return mktime(23, 59, 59, date("m"), $target_day, date("y"));
}
|
||||
|
||||
/**
 * Read a file or URL.
 *
 * Tries cURL first when the extension is available and falls back to
 * file_get_contents() when cURL is missing or returned nothing.
 *
 * @param string $url     URL or path
 * @param int    $timeout timeout in seconds
 * @return string|false the content, or false when it is empty or unreadable
 */
public static function get_file($url, $timeout = 10)
{
    if (function_exists('curl_init'))
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => 0,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => 10,
        ));
        $content = curl_exec($ch);
        curl_close($ch);
        if ($content)
        {
            return $content;
        }
    }

    $ctx = stream_context_create(array('http' => array('timeout' => $timeout)));
    $content = @file_get_contents($url, 0, $ctx);
    return $content ? $content : false;
}
|
||||
|
||||
/**
 * Write a file, creating its directory recursively when missing.
 *
 * @param string $file    target path
 * @param mixed  $content data to write
 * @param int    $flag    FILE_APPEND to append (written without a lock);
 *                        anything else overwrites under LOCK_EX
 * @return int|false bytes written, or false when the directory cannot be
 *                   created or the write fails
 */
public static function put_file($file, $content, $flag = 0)
{
    $dir = pathinfo($file, PATHINFO_DIRNAME);
    if (!empty($dir) && file_exists($dir) === false && @mkdir($dir, 0777, true) === false)
    {
        return false;
    }

    // Appends are deliberately unlocked here, matching the existing behaviour.
    $write_flags = ($flag === FILE_APPEND) ? FILE_APPEND : LOCK_EX;
    return @file_put_contents($file, $content, $write_flags);
}
|
||||
|
||||
/**
 * Ensure a directory exists, creating it recursively when missing.
 *
 * @param mixed $path directory path
 * @static
 * @access public
 * @return bool|string the path on success, false when it cannot be created
 */
public static function path_exists($path)
{
    // Appending a dummy file name makes pathinfo() report $path as the directory.
    $info = pathinfo($path . '/tmp.txt');
    $dir = $info['dirname'];
    if (!empty($dir) && file_exists($dir) === false && mkdir($dir, 0777, true) === false)
    {
        return false;
    }
    return $path;
}
|
||||
|
||||
/**
 * Delete a directory recursively.
 *
 * @param mixed $dir directory path
 * @return bool true when the directory was removed, false when it could not
 *              be opened or removed
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function deldir($dir)
{
    $dh = opendir($dir);
    if ($dh === false)
    {
        return false;
    }

    // Delete the directory contents first. Compare with false explicitly: an
    // entry named "0" is falsy and would otherwise end the loop early.
    while (($file = readdir($dh)) !== false)
    {
        if ($file == "." || $file == "..")
        {
            continue;
        }
        $fullpath = $dir."/".$file;
        if (!is_dir($fullpath))
        {
            unlink($fullpath);
        }
        else
        {
            self::deldir($fullpath);
        }
    }
    closedir($dh);

    // Then remove the now-empty directory itself.
    return rmdir($dir);
}
|
||||
|
||||
/**
 * Change permissions recursively.
 *
 * Files are chmod-ed directly; directories are walked first and chmod-ed
 * last. The walk stops with false on the first symbolic link or on the first
 * entry whose permissions cannot be changed.
 *
 * @param mixed $path     file or directory
 * @param mixed $filemode permissions, e.g. 0755
 * @return bool
 */
public static function chmodr($path, $filemode)
{
    if (!is_dir($path))
    {
        return @chmod($path, $filemode);
    }

    $dh = opendir($path);
    if ($dh === false)
    {
        return false;
    }

    $ok = true;
    while (($file = readdir($dh)) !== false)
    {
        if ($file == '.' || $file == '..')
        {
            continue;
        }
        $fullpath = $path . '/' . $file;
        // The recursive call handles plain files as well as directories.
        if (is_link($fullpath) || !self::chmodr($fullpath, $filemode))
        {
            $ok = false;
            break;
        }
    }
    // Close on every path; the early returns used to leak the handle.
    closedir($dh);

    return $ok && @chmod($path, $filemode);
}
|
||||
|
||||
/**
 * Format an array as one CSV line.
 *
 * Commas are stripped from every value and the values are joined with ",".
 * No quoting or escaping is applied.
 *
 * @param mixed $data array of values
 * @return string
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-07-29 11:32
 */
public static function format_csv($data)
{
    foreach ($data as $key => $field)
    {
        $data[$key] = str_replace(",", "", $field);
    }
    return implode(",", $data);
}
|
||||
|
||||
/**
 * Check whether a string is valid UTF-8.
 *
 * The string is converted UTF-8 -> UTF-32 -> UTF-8; it is considered valid
 * when the round trip reproduces it exactly.
 *
 * @param string $str
 * @return bool
 */
public static function is_utf8($str)
{
    $as_utf32 = mb_convert_encoding($str, "UTF-32", "UTF-8");
    $round_trip = mb_convert_encoding($as_utf32, "UTF-8", "UTF-32");
    return $str === $round_trip;
}
|
||||
|
||||
/**
 * Detect the encoding of a string.
 *
 * Candidates are tried in the order UTF-8, GBK, GB2312, LATIN1, ASCII, BIG5.
 *
 * @param string $string
 * @return string lower-cased encoding name
 */
public static function get_encoding($string)
{
    $candidates = array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5');
    return strtolower(mb_detect_encoding($string, $candidates));
}
|
||||
|
||||
/**
 * Convert the encoding of every string in an array.
 *
 * Walks the array recursively and converts string keys and string values
 * with iconv(); other values are returned unchanged. This replaces the former
 * eval() of re-encoded var_export() output, which produced broken (or
 * injectable) PHP whenever a converted byte was a backslash or a quote, for
 * example a GBK trail byte 0x5C.
 *
 * @param array  $arr           data to convert (a plain string is accepted too)
 * @param string $from_encoding source encoding
 * @param string $to_encoding   target encoding; "//IGNORE" is appended
 * @return array
 */
public static function array_iconv($arr, $from_encoding, $to_encoding)
{
    if (is_string($arr))
    {
        return iconv($from_encoding, $to_encoding.'//IGNORE', $arr);
    }
    if (!is_array($arr))
    {
        return $arr;
    }

    $converted = array();
    foreach ($arr as $key => $value)
    {
        if (is_string($key))
        {
            $key = iconv($from_encoding, $to_encoding.'//IGNORE', $key);
        }
        $converted[$key] = self::array_iconv($value, $from_encoding, $to_encoding);
    }
    return $converted;
}
|
||||
|
||||
/**
 * Convert a date/time string to a Unix timestamp (strtotime with support for
 * Chinese date notation).
 *
 * Chinese unit characters are rewritten to "Y-m-d H:i:s" separators and the
 * parts are passed to mktime(). Missing parts default to 1970-01-01 00:00:00,
 * and a two-digit year gets "20" prepended. When a part is not a plain number
 * the normalised string is handed to strtotime() instead.
 *
 * @param string $dtime
 * @return mixed the input unchanged when it contains only digits, otherwise
 *               the timestamp (or strtotime()'s result)
 */
public static function cn_strtotime($dtime)
{
    // Pure digits are taken to be a timestamp already.
    if (!preg_match("/[^0-9]/", $dtime))
    {
        return $dtime;
    }

    $dtime = trim($dtime);
    $dt = array(1970, 1, 1, 0, 0, 0);
    $dtime = preg_replace("/[\r\n\t]|日|秒/", " ", $dtime);
    $dtime = str_replace("年", "-", $dtime);
    $dtime = str_replace("月", "-", $dtime);
    $dtime = str_replace("时", ":", $dtime);
    $dtime = str_replace("分", ":", $dtime);
    $dtime = trim(preg_replace("/[ ]{1,}/", " ", $dtime));

    $ds = explode(" ", $dtime);
    $ymd = explode("-", $ds[0]);
    if (!isset($ymd[1]))
    {
        // Also accept "Y.m.d".
        $ymd = explode(".", $ds[0]);
    }
    for ($k = 0; $k < 3; $k++)
    {
        if (isset($ymd[$k]))
        {
            $dt[$k] = $ymd[$k];
        }
    }
    if (strlen($dt[0]) == 2)
    {
        $dt[0] = '20' . $dt[0];
    }
    if (isset($ds[1]))
    {
        $hms = explode(":", $ds[1]);
        for ($k = 0; $k < 3; $k++)
        {
            if (isset($hms[$k]))
            {
                $dt[3 + $k] = $hms[$k];
            }
        }
    }

    foreach ($dt as $k => $v)
    {
        $v = trim($v);
        if ($v === '')
        {
            $dt[$k] = 0;
            continue;
        }
        // mktime() throws a TypeError for non-numeric strings on PHP 8, which
        // made the strtotime() fallback unreachable; take the fallback here.
        if (!ctype_digit($v))
        {
            return strtotime($dtime);
        }
        $dt[$k] = (int) $v;
    }

    $mt = mktime($dt[3], $dt[4], $dt[5], $dt[1], $dt[2], $dt[0]);
    if (!empty($mt))
    {
        return $mt;
    }
    return strtotime($dtime);
}
|
||||
|
||||
/**
 * Cut a UTF-8 string to a maximum length without splitting characters.
 *
 * With $count_words enabled (default) the length is a display width:
 * a multi-byte character counts as 2, a single-byte character as 1.
 * With $count_words disabled the length is a number of characters.
 * $etc is appended only when characters were actually cut off.
 *
 * Bytes that do not form a UTF-8 sequence matched by the pattern below
 * are dropped from the result.
 *
 * @param string  $string       text to shorten
 * @param int     $length       maximum width / character count; 0 yields ''
 * @param string  $etc          suffix appended to a shortened string
 * @param boolean $count_words  true: width based, false: character based
 * @return string
 */
public static function cn_substr($string, $length = 80, $etc = '...', $count_words = true)
{
    // Kept from the original implementation: callers may rely on this global setting
    mb_internal_encoding("UTF-8");
    if ($length == 0) return '';
    // Byte length already within the limit: nothing to cut
    if (strlen($string) <= $length) return $string;

    // Split into single UTF-8 characters
    preg_match_all("/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/", $string, $info);
    $chars = $info[0];
    $total = count($chars);

    if ($count_words)
    {
        $width = 0;
        $cut = "";
        foreach ($chars as $i => $char)
        {
            $cut .= $char;
            $width += (ord($char) >= 128) ? 2 : 1;
            if ($width >= $length)
            {
                // Add the suffix only if something follows the cut point
                return ($i < $total - 1) ? $cut . $etc : $cut;
            }
        }
        return join('', $chars);
    }

    if ($total <= $length)
    {
        return join('', $chars);
    }
    return join("", array_slice($chars, 0, $length)) . $etc;
}
|
||||
|
||||
/**
 * Return the lower-cased text after the last "." of a file name.
 *
 * A name without any "." is returned lower-cased as a whole.
 *
 * @param mixed $file_name file name
 * @static
 *
 * @access public
 * @return string
 */
public static function get_extension($file_name)
{
    $segments = explode('.', $file_name);
    $last = $segments[count($segments) - 1];
    return strtolower($last);
}
|
||||
|
||||
// Return the address a URL redirects to (HTTP 301/302), or the URL itself
// when there is no such redirect or no Location header is reported.
public static function getrealurl($url)
{
    if (empty($url))
    {
        return $url;
    }

    $response = get_headers($url, 1);
    // Entries 0 and 1 are both required before a redirect is considered
    if (empty($response[0]) || empty($response[1]))
    {
        return $url;
    }

    $status_line = $response[0];
    $redirected = strpos($status_line, '301') || strpos($status_line, '302');
    if (!$redirected || empty($response['Location']))
    {
        return $url;
    }

    // Several Location headers arrive as an array: use the last one
    $location = $response['Location'];
    return is_array($location) ? $location[count($location) - 1] : $location;
}
|
||||
|
||||
/**
 * Decompress data a server sent with "Content-Encoding: gzip".
 *
 * The gzip header (10 fixed bytes plus the optional extra field, file
 * name, comment and header CRC announced by the flag byte) is skipped and
 * the rest is inflated. Whenever the header is incomplete or inflating
 * fails, the input is returned unchanged.
 *
 * @param string $data raw response body
 * @return string the inflated data, or $data itself when it cannot be decoded
 */
public static function gzdecode($data)
{
    $total = strlen($data);
    // Shorter than the fixed gzip header: nothing to inflate
    if ($total < 10)
    {
        return $data;
    }

    $flags = ord($data[3]);
    $headerlen = 10;

    // Extra field: 2-byte little-endian length followed by that many bytes
    if ($flags & 4)
    {
        if ($total < 12)
        {
            return $data;
        }
        $extralen = unpack('v', substr($data, 10, 2));
        $headerlen += 2 + $extralen[1];
    }

    // File name (8) and comment (16) are NUL-terminated strings
    foreach (array(8, 16) as $flag)
    {
        if ($flags & $flag)
        {
            // strpos() must never get an offset behind the end of the data
            if ($headerlen >= $total)
            {
                return $data;
            }
            $end = strpos($data, chr(0), $headerlen);
            if ($end === false)
            {
                return $data;
            }
            $headerlen = $end + 1;
        }
    }

    // Header CRC
    if ($flags & 2)
    {
        $headerlen += 2;
    }

    if ($headerlen >= $total)
    {
        return $data;
    }

    // Best effort: inflate errors are suppressed and answered with the raw input
    $unpacked = @gzinflate(substr($data, $headerlen));
    return ($unpacked === FALSE) ? $data : $unpacked;
}
|
||||
|
||||
/**
 * Spell a monetary amount in Chinese.
 *
 * The amount is rounded to two decimals; the integer part is followed by
 * 元, the two decimals by 角 and 分.
 *
 * @param string|integer|float $num amount to spell
 * @param boolean $sim true: everyday numerals (一, 二, ...);
 *                     false (default): formal numerals (壹, 贰, ...)
 * @return string the spelled amount, or an error text for non-numeric input
 */
public static function number2chinese($num, $sim = FALSE)
{
    if (!is_numeric($num))
    {
        return '含有非数字非小数点字符!';
    }

    if ($sim)
    {
        $digits = array('零', '一', '二', '三', '四', '五', '六', '七', '八', '九');
        $units = array('', '十', '百', '千', '', '万', '亿', '兆');
    }
    else
    {
        $digits = array('零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖');
        $units = array('', '拾', '佰', '仟', '', '萬', '億', '兆');
    }

    $parts = explode('.', sprintf("%01.2f", $num));
    $integer = $parts[0];
    $fraction = $parts[1];
    $text = '';

    // Decimal part
    if ($fraction[0] > 0)
    {
        $text .= $digits[$fraction[0]] . '角';
    }
    if ($fraction[1] > 0)
    {
        $text .= $digits[$fraction[1]] . '分';
    }

    // Integer part, processed from the lowest digit upwards
    if ($integer > 0)
    {
        $text = "元" . $text;
        $only_zeros_so_far = true;
        $reversed = strrev((string) intval($integer));
        $pieces = array();
        $count = strlen($reversed);
        for ($pos = 0; $pos < $count; $pos++)
        {
            $digit = $reversed[$pos];
            if ($digit > 0)
            {
                $only_zeros_so_far = false;
            }

            // Zeros at the low end are not spelled at all
            $piece = ($only_zeros_so_far && $digit == 0) ? "" : $digits[$digit];
            if ($digit != '0')
            {
                $piece .= $units[$pos % 4];
            }
            // A zero that follows another zero is dropped
            if ($pos > 1 and $digit + $reversed[$pos - 1] == 0)
            {
                $piece = '';
            }
            // Every fourth position carries the group unit
            if ($pos % 4 == 0)
            {
                $piece .= $units[4 + (int) floor($pos / 4)];
            }
            $pieces[$pos] = $piece;
        }
        $text = join('', array_reverse($pieces)) . $text;
    }
    return $text;
}
|
||||
|
||||
/**
 * Wrap a string in an ANSI colour escape sequence chosen by status.
 *
 * @param string $str    text to colour
 * @param string $status one of succ, error, warn, note, debug; anything
 *                       else gets the reset sequence as prefix
 * @return string the text, prefixed with the colour and followed by a reset
 */
public static function colorize($str, $status = "info")
{
    $palette = array(
        'succ'  => "\033[32m", // green
        'error' => "\033[31m", // red
        'warn'  => "\033[33m", // yellow
        'note'  => "\033[34m", // blue
        'debug' => "\033[36m", // cyan
    );

    // Prefix for "info" and every unknown status
    $prefix = "\033[0m";
    foreach ($palette as $name => $code)
    {
        // Loose comparison, first match wins
        if ($status == $name)
        {
            $prefix = $code;
            break;
        }
    }
    return $prefix.$str."\033[0m";
}
|
||||
|
||||
/**
 * Convert a DOM node into a nested array.
 *
 * Attributes are stored under "@name" keys. A child whose only child is a
 * text node is stored as its text value; any other child is converted
 * recursively and stored under its local name. Children sharing a name
 * overwrite each other.
 *
 * @param DOMDocument $dom  document the node belongs to
 * @param DOMNode     $node node to convert
 * @return array|string|false false for wrong argument types and for nodes
 *                            without a local name
 */
public static function node_to_array($dom, $node)
{
    $valid = is_a( $dom, 'DOMDocument' ) && is_a( $node, 'DOMNode' );
    if (!$valid)
    {
        return false;
    }

    // Nodes without a local name are discarded
    $name = trim( $node->localName );
    if (empty($name))
    {
        return false;
    }
    if (XML_TEXT_NODE == $node->nodeType)
    {
        return $node->nodeValue;
    }

    $result = array();
    foreach ($node->attributes as $attribute)
    {
        $result['@'.$attribute->localName] = $attribute->nodeValue;
    }

    foreach ($node->childNodes as $child)
    {
        $grandchildren = $child->childNodes;
        $is_text_leaf = isset($grandchildren->length)
            && 1 == $grandchildren->length
            && XML_TEXT_NODE == $child->firstChild->nodeType;

        if ($is_text_leaf)
        {
            $result[$child->localName] = $child->nodeValue;
            continue;
        }

        $converted = self::node_to_array( $dom, $child );
        if (false !== $converted)
        {
            $result[$child->localName] = $converted;
        }
    }
    return $result;
}
|
||||
|
||||
/**
 * Tell whether PHP_OS starts with "WIN" (compared case-insensitively).
 *
 * @return boolean
 */
public static function is_win()
{
    return strncasecmp(PHP_OS, "WIN", 3) === 0;
}
|
||||
|
||||
/**
 * Split a query string into its parameters (the reverse direction of
 * http_build_query, without URL-decoding names or values).
 *
 * @param string  $query    a full URL, or a bare query string when
 *                          $is_query is true
 * @param boolean $is_query true when $query already is the query string
 * @return array name => value; a parameter written without "=" gets null.
 *               Empty array when the URL has no query string.
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-05-16 17:29
 */
public static function http_split_query($query, $is_query = false)
{
    if (!$is_query)
    {
        $parse_arr = parse_url($query);
        if (empty($parse_arr['query']))
        {
            return array();
        }
        $query = $parse_arr['query'];
    }

    $params = array();
    foreach (explode("&", $query) as $pair)
    {
        // Skip the empty pieces produced by "", "a=1&" or "a=1&&b=2"
        if ($pair === '')
        {
            continue;
        }
        // Split at the first "=" only, so values may contain "=" themselves
        $arr = explode("=", $pair, 2);
        $params[$arr[0]] = isset($arr[1]) ? $arr[1] : null;
    }
    return $params;
}
|
||||
}
|
||||
|
||||
|
421
vendor/owner888/phpspider/core/worker.php
vendored
Normal file
421
vendor/owner888/phpspider/core/worker.php
vendored
Normal file
@ -0,0 +1,421 @@
|
||||
<?php
|
||||
// +----------------------------------------------------------------------
|
||||
// | PHPSpider [ A PHP Framework For Crawler ]
|
||||
// +----------------------------------------------------------------------
|
||||
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
|
||||
// +----------------------------------------------------------------------
|
||||
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
|
||||
// +----------------------------------------------------------------------
|
||||
// | Author: Seatle Yang <seatle@foxmail.com>
|
||||
// +----------------------------------------------------------------------
|
||||
|
||||
//----------------------------------
|
||||
// Worker多进程操作类
|
||||
//----------------------------------
|
||||
|
||||
/**
 * Multi-process helper built on the pcntl and posix extensions.
 *
 * The process that creates the object is the master (worker id 0). run()
 * forks $count worker processes; each worker calls the on_worker_start
 * callback and exits afterwards, while the master waits for the workers in
 * monitor_workers() and re-forks them unless $run_once is set.
 */
class worker
{
    // Number of worker processes
    public $count = 0;
    // Worker id; workers are numbered from 1, 0 is used by the master process
    public $worker_id = 0;
    // Process id of the current process
    public $worker_pid = 0;
    // User the worker processes should run as
    public $user = '';
    // Process title
    public $title = '';
    // Whether each worker runs only once (no re-fork after it exits)
    public $run_once = true;
    // Whether log output is shown
    public $log_show = false;
    // Callback run in the master when it starts
    public $on_start = false;
    // Callback run in the master when it stops
    public $on_stop = false;
    // Callback run in a worker when it starts
    public $on_worker_start = false;
    // Callback run in a worker when it stops
    public $on_worker_stop = false;
    // Process id of the master
    protected static $_master_pid = 0;
    // Worker id => process id; 0 marks a worker that has exited
    protected static $_worker_pids = array();
    // Start time of the master / worker process
    public $time_start = 0;
    // Run state of the master / worker process [starting|running|shutdown|reload]
    protected static $_status = "starting";


    public function __construct()
    {
        self::$_master_pid = posix_getpid();
        // Enable ticks; without them the parent process does not receive signals
        declare(ticks = 1);
        $this->install_signal();
    }

    /**
     * Install the signal handlers
     * @return void
     */
    protected function install_signal()
    {
        // stop
        pcntl_signal(SIGINT, array($this, 'signal_handler'), false);
        // reload
        pcntl_signal(SIGUSR1, array($this, 'signal_handler'), false);
        // status
        pcntl_signal(SIGUSR2, array($this, 'signal_handler'), false);
        // ignore
        pcntl_signal(SIGPIPE, SIG_IGN, false);
        // install signal handler for dead kids
        // pcntl_signal(SIGCHLD, array($this, 'signal_handler'));
    }

    /**
     * Remove the signal handlers
     * @return void
     */
    protected function uninstall_signal()
    {
        // uninstall stop signal handler
        pcntl_signal(SIGINT, SIG_IGN, false);
        // uninstall reload signal handler
        pcntl_signal(SIGUSR1, SIG_IGN, false);
        // uninstall status signal handler
        pcntl_signal(SIGUSR2, SIG_IGN, false);
    }

    /**
     * Signal handler; public because it is registered as a callback
     * @param int $signal
     */
    public function signal_handler($signal)
    {
        switch ($signal)
        {
            // stop
            case SIGINT:
                // Runs in the master and in the workers
                $this->stop_all();
                break;
            // reload
            case SIGUSR1:
                echo "reload\n";
                break;
            // show status
            case SIGUSR2:
                echo "status\n";
                break;
        }
    }

    /**
     * Run the master: fork the workers and monitor them
     */
    public function run()
    {
        $this->time_start = microtime(true);
        $this->worker_id = 0;
        $this->worker_pid = posix_getpid();
        $this->set_process_title($this->title);

        // Set before forking so that the workers inherit the value
        if ($this->log_show)
        {
            log::$log_show = true;
        }

        if ($this->on_start)
        {
            call_user_func($this->on_start, $this);
        }

        // Workers are numbered from 1, 0 is used by the master process
        for ($i = 1; $i <= $this->count; $i++)
        {
            $this->fork_one_worker($i);
        }
        $this->monitor_workers();
    }

    /**
     * Fork one worker process.
     *
     * In the parent the child's pid is recorded under $worker_id. The
     * child runs the on_worker_start callback, then stop(), and exits
     * with status 0. A failed fork is logged and ends the process.
     *
     * @param int $worker_id id of the worker to create
     */
    public function fork_one_worker($worker_id)
    {
        //$sockets = stream_socket_pair(STREAM_PF_UNIX, STREAM_SOCK_STREAM, STREAM_IPPROTO_IP);
        $pid = pcntl_fork();

        // Parent: remember the pid of the child
        if ($pid > 0)
        {
            self::$_worker_pids[$worker_id] = $pid;
        }
        // Child
        elseif (0 === $pid)
        {
            $this->time_start = microtime(true);
            $this->worker_id = $worker_id;
            $this->worker_pid = posix_getpid();
            $this->set_process_title($this->title);
            $this->set_process_user($this->user);
            // Drop the worker pids copied over from the master
            self::$_worker_pids = array();
            //$this->uninstall_signal();

            // The worker is running now
            self::$_status = "running";

            // Registered in the child: reports an unexpected exit
            register_shutdown_function(array($this, 'check_errors'));

            if ($this->on_worker_start)
            {
                call_user_func($this->on_worker_start, $this);
            }

            // Stop this worker instance
            $this->stop();
            // 0 means a normal exit
            exit(0);
        }
        else
        {
            log::add("fork one worker fail", "Error");
            exit;
        }
    }

    /**
     * Try to switch the current process to another user.
     *
     * Does nothing when no user name is given or the process is not
     * running as root.
     *
     * @param string $user_name
     */
    protected static function set_process_user($user_name)
    {
        // No user name, or the current user is not root
        if (empty($user_name) || posix_getuid() !== 0)
        {
            return;
        }
        $user_info = posix_getpwnam($user_name);
        // posix_getpwnam() returns false for an unknown user
        if (!$user_info)
        {
            log::add('Can not run worker as '.$user_name.' , user does not exist', "Error");
            return;
        }
        if ($user_info['uid'] != posix_getuid() || $user_info['gid'] != posix_getgid())
        {
            if (!posix_setgid($user_info['gid']) || !posix_setuid($user_info['uid']))
            {
                log::add('Can not run woker as '.$user_name." , You shuld be root", "Error");
            }
        }
    }

    /**
     * Set the name of the current process as shown by "ps aux".
     * Needs php >= 5.5 or the proctitle extension.
     * @param string $title
     * @return void
     */
    protected function set_process_title($title)
    {
        if (!empty($title))
        {
            // Extension
            if (extension_loaded('proctitle') && function_exists('setproctitle'))
            {
                @setproctitle($title);
            }
            // >=php 5.5
            elseif (function_exists('cli_set_process_title'))
            {
                cli_set_process_title($title);
            }
        }
    }

    /**
     * Wait for worker processes to exit and react to their exit status.
     *
     * Ends the master (after the on_stop callback) once every worker has
     * exited, or as soon as pcntl_wait() returns without a child pid.
     *
     * @return void
     */
    public function monitor_workers()
    {
        // The master is running now
        self::$_status = "running";
        while (1)
        {
            // Block until a child exits or a signal interrupts the wait
            // NOTE(review): WUNTRACED also reports stopped children, which are
            // then handled like exited ones — confirm this is intended
            $status = 0;
            $pid = pcntl_wait($status, WUNTRACED);

            // A child exited
            if ($pid > 0)
            {
                // Not a normal exit, e.g. the worker was killed
                if ($status !== 0)
                {
                    log::add("worker {$pid} exit with status $status", "Warning");
                }

                // Look up the worker id that belongs to the pid
                $worker_id = array_search($pid, self::$_worker_pids, true);
                if ($worker_id !== false)
                {
                    // Keep the slot (pid 0) so the worker can be restarted
                    self::$_worker_pids[$worker_id] = 0;

                    // Start a replacement worker
                    if (!$this->run_once)
                    {
                        $this->fork_one_worker($worker_id);
                    }
                }

                // A worker that still has a pid keeps the master alive
                $all_worker_stop = true;
                foreach (self::$_worker_pids as $_worker_pid)
                {
                    if ($_worker_pid != 0)
                    {
                        $all_worker_stop = false;
                    }
                }
                if ($all_worker_stop)
                {
                    if ($this->on_stop)
                    {
                        call_user_func($this->on_stop, $this);
                    }
                    exit(0);
                }
            }
            // pcntl_wait() returned no child pid (interrupted or failed)
            else
            {
                if ($this->on_stop)
                {
                    call_user_func($this->on_stop, $this);
                }
                exit(0);
            }
        }
    }

    /**
     * Shut down (all processes); triggered by a signal, not by the normal
     * end of the program.
     * @return void
     */
    public function stop_all()
    {
        // Master and workers switch to the shutdown state
        self::$_status = "shutdown";
        // Master
        if (self::$_master_pid === posix_getpid())
        {
            // Send the stop signal to every worker that is still alive
            foreach (self::$_worker_pids as $worker_pid)
            {
                // Exited workers are stored as pid 0; kill(0, ...) would
                // signal the whole process group, the master included
                if ($worker_pid > 0)
                {
                    posix_kill($worker_pid, SIGINT);
                }
            }
        }
        // Worker
        else
        {
            // Asked by the master to stop
            $this->stop();
            exit(0);
        }
    }

    /**
     * Stop the current worker instance; called both at the normal end of a
     * worker and when it is stopped by a signal.
     * @return void
     */
    public function stop()
    {
        if ($this->on_worker_stop)
        {
            call_user_func($this->on_worker_stop, $this);
        }
        // The worker is shut down now
        self::$_status = "shutdown";
    }

    /**
     * Shutdown function: logs an error when the process ends without
     * having reached the shutdown state.
     * @return void
     */
    public function check_errors()
    {
        if (self::$_status != "shutdown")
        {
            $error_msg = "WORKER EXIT UNEXPECTED ";
            $errors = error_get_last();
            if ($errors && ($errors['type'] === E_ERROR ||
                $errors['type'] === E_PARSE ||
                $errors['type'] === E_CORE_ERROR ||
                $errors['type'] === E_COMPILE_ERROR ||
                $errors['type'] === E_RECOVERABLE_ERROR ))
            {
                $error_msg .= $this->get_error_type($errors['type']) . " {$errors['message']} in {$errors['file']} on line {$errors['line']}";
            }
            log::add($error_msg, 'Error');
        }
    }

    /**
     * Name of a PHP error level constant
     * @param integer $type
     * @return string the constant name, '' for an unknown level
     */
    protected function get_error_type($type)
    {
        switch ($type)
        {
            case E_ERROR: // 1
                return 'E_ERROR';
            case E_WARNING: // 2
                return 'E_WARNING';
            case E_PARSE: // 4
                return 'E_PARSE';
            case E_NOTICE: // 8
                return 'E_NOTICE';
            case E_CORE_ERROR: // 16
                return 'E_CORE_ERROR';
            case E_CORE_WARNING: // 32
                return 'E_CORE_WARNING';
            case E_COMPILE_ERROR: // 64
                return 'E_COMPILE_ERROR';
            case E_COMPILE_WARNING: // 128
                return 'E_COMPILE_WARNING';
            case E_USER_ERROR: // 256
                return 'E_USER_ERROR';
            case E_USER_WARNING: // 512
                return 'E_USER_WARNING';
            case E_USER_NOTICE: // 1024
                return 'E_USER_NOTICE';
            case E_STRICT: // 2048
                return 'E_STRICT';
            case E_RECOVERABLE_ERROR: // 4096
                return 'E_RECOVERABLE_ERROR';
            case E_DEPRECATED: // 8192
                return 'E_DEPRECATED';
            case E_USER_DEPRECATED: // 16384
                return 'E_USER_DEPRECATED';
        }
        return "";
    }
}
|
20
vendor/owner888/phpspider/gitadd.sh
vendored
Normal file
20
vendor/owner888/phpspider/gitadd.sh
vendored
Normal file
@ -0,0 +1,20 @@
|
||||
#!/bin/bash
# Pull, then add, commit and push a single file or directory.
# Usage: gitadd.sh <path> [commit message]
#   <path>            file or directory to add; must exist
#   [commit message]  defaults to "add file"

if [ ! -d "$1" ] && [ ! -f "$1" ]; then
    echo "file $1 not exists"
    # Report the failure to the caller instead of exiting with status 0
    exit 1
fi
filename=$1

comment="add file"
if [[ $2 != "" ]]; then
    comment=$2
fi

echo "start update..."
git pull
echo "start add new file..."
# Quoted so that paths containing blanks stay one argument
git add -- "$filename"
echo "start commit..."
git commit -m "$comment" -- "$filename"
git push -u origin master
echo "git commit complete..."
|
129
vendor/owner888/phpspider/hacked-emails/banners.txt
vendored
Normal file
129
vendor/owner888/phpspider/hacked-emails/banners.txt
vendored
Normal file
@ -0,0 +1,129 @@
|
||||
|
||||
_-o#&&*''''?d:>b\_
|
||||
_o/"`'' '',, dMF9MMMMMHo_
|
||||
.o&#' `"MbHMMMMMMMMMMMHo.
|
||||
.o"" ' vodM*$&&HMMMMMMMMMM?.
|
||||
,' $M&ood,~'`(&##MMMMMMH\
|
||||
/ ,MMMMMMM#b?#bobMMMMHMMML
|
||||
& ?MMMMMMMMMMMMMMMMM7MMM$R*Hk
|
||||
?$. :MMMMMMMMMMMMMMMMMMM/HMMM|`*L
|
||||
| |MMMMMMMMMMMMMMMMMMMMbMH' T,
|
||||
$H#: `*MMMMMMMMMMMMMMMMMMMMb#]' `?
|
||||
]MMH# ""*""""*#MMMMMMMMMMMMM' -
|
||||
MMMMMb_ |MMMMMMMMMMMP' :
|
||||
HMMMMMMMHo `MMMMMMMMMT .
|
||||
?MMMMMMMMP 9MMMMMMMM] -
|
||||
-?MMMMMMM |MMMMMMMMM?,d- ' {Name}
|
||||
:|MMMMMM- `MMMMMMMT .M|. : {Description}
|
||||
.9MMM[ &MMMMM*' `' . {Loaded}
|
||||
:9MMk `MMM#" -
|
||||
&M] ` .-
|
||||
`&. .
|
||||
`~, . ./
|
||||
. _ .-
|
||||
'`--._,dd###pp=""'
|
||||
|
||||
$$$$$AnyShIt$$$$$$
|
||||
|
||||
_v->#H#P? "':o<>\_
|
||||
.,dP` `'' "'-o.+H6&MMMHo_
|
||||
oHMH9' `?&bHMHMMMMMMHo.
|
||||
oMP"' ' ooMP*#&HMMMMMMM?.
|
||||
,M* - `*MSdob//`^&##MMMH\
|
||||
d*' .,MMMMMMH#o>#ooMMMMMb
|
||||
HM- :HMMMMMMMMMMMMMMM&HM[R\
|
||||
d"Z\. 9MMMMMMMMMMMMMMMMM[HMM|:
|
||||
-H - MMMMMMMMMMMMMMMMMMMbMP' :
|
||||
:??Mb# `9MMMMMMMMMMMMMMMMMMH#! .
|
||||
: MMMMH#, "*""""`#HMMMMMMMMMMH -
|
||||
||MMMMMM6\. [MMMMMMMMMH' :
|
||||
:|MMMMMMMMMMHo `9MMMMMMMM' .
|
||||
. HMMMMMMMMMMP' !MMMMMMMM `
|
||||
- `#MMMMMMMMM HMMMMMMM*,/ :
|
||||
: ?MMMMMMMF HMMMMMM',P' : {Name}
|
||||
. HMMMMR' [MMMMP' ^' - {Description}
|
||||
: `HMMMT iMMH' .' {Loaded}
|
||||
-.`HMH .
|
||||
-:*H . '
|
||||
-`\,, . .-
|
||||
' . _ .-`
|
||||
'`~\.__,obb#q==~'''
|
||||
|
||||
$$$$$AnyShIt$$$$$$
|
||||
|
||||
_ood>H&H&Z?#M#b-\.
|
||||
.\HMMMMMR?`\M6b."`' ''``v.
|
||||
.. .MMMMMMMMMMHMMM#&. ``~o.
|
||||
. ,HMMMMMMMMMMMM*"'-` &b.
|
||||
. .MMMMMMMMMMMMH' `"&\
|
||||
- RMMMMM#H##R' 4Mb
|
||||
- |7MMM' ?:: `|MMb
|
||||
/ HMM__#|`"\>?v.. `MMML
|
||||
. `"'#Hd| ` 9MMM:
|
||||
- |\,\?HH#bbL `9MMb
|
||||
: !MMMMMMMH#b, `""T
|
||||
. . ,MMMMMMMMMMMbo. |
|
||||
: 4MMMMMMMMMMMMMMMHo |
|
||||
: ?MMMMMMMMMMMMMMM? :
|
||||
-. `#MMMMMMMMMMMM: .-
|
||||
: |MMMMMMMMMM? .
|
||||
- JMMMMMMMT' : {Name}
|
||||
`. MMMMMMH' - {Description}
|
||||
-. |MMM#*` - {Loaded}
|
||||
. HMH' . '
|
||||
-. #H:. .-
|
||||
` . .\ .-
|
||||
'-..-+oodHL_,--/-`
|
||||
|
||||
|
||||
$$$$$AnyShIt$$$$$$
|
||||
|
||||
.,:,#&6dHHHb&##o\_
|
||||
.oHHMMMMMMMMMMMMMMMMMH*\,.
|
||||
oHMMMMMMMMMMMMMMMMMMMMMMHb:'-.
|
||||
.dMMMMMMMMMMMMMMMMMMMMMMMMMH|\/' .
|
||||
,&HMMMMMMMMMMMMMMMMMMMMMMM/"&.,d. -.
|
||||
dboMMHMMMMMMMMMMMMMMMMMMMMMML `' .
|
||||
HMHMMM$Z***MMMMMMMMMMMMMMMMMM|.- .
|
||||
dMM]MMMM#' `9MMMH?"`MMMMR'T' _ :
|
||||
|MMMbM#'' |MM" ``MMMH. <_ .
|
||||
dMMMM#& *&. .?`*" .'&: .
|
||||
MMMMMH- `' -v/H .dD "' ' :
|
||||
MMMM* `*M: 4MM*::-!v,_ :
|
||||
MMMM `*?::" "'``"?9Mb::. :
|
||||
&MMM, `"'"'|"._ "?`| - :
|
||||
`MMM].H ,#dM[_H ..:
|
||||
9MMi`M: . .ooHMMMMMMM, ..
|
||||
9Mb `- 1MMMMMMMMMM| : {Name}
|
||||
?M |MM#*#MMMM* . {Description}
|
||||
-. ` |#"' ,' {Loaded}
|
||||
. -" v`
|
||||
-. .-
|
||||
- . . `
|
||||
'-*#d#HHMMMMHH#"-'
|
||||
|
||||
$$$$$AnyShIt$$$$$$
|
||||
|
||||
.-:?,Z?:&$dHH##b\_
|
||||
,:bqRMMMMMMMMMMMMMMMMMHo.
|
||||
.?HHHMMMMMMMMMMMMMMMMMMMMMMMHo.
|
||||
-o/*M9MMMMMMMMMMMMMMMMMMMMMMMMMMMv
|
||||
.:H\b\'|?#HHMMMMMMMMMMMMMMMMMMMMMM6?Z\
|
||||
.?MMMHbdbbodMMMMHMMMMMMMMMMMMMMMMMMMM\':
|
||||
:MMMMMMMMMMM7MMMMb?6P**#MMMMMMMMMMMMMMM_ :
|
||||
\MMMMMMMMMMMMb^MMMMMM? `*MMMM*"`MMMR<' . -
|
||||
.1MMMMMMMMMMMMMb]M#"" 9MR' `?MMb \. :
|
||||
-MMMMMMMMMMMMMMMH##|` *&. |`*' .\ .
|
||||
-?""*MMMMMMMMMMMMM' ' |?b ,]" :
|
||||
: MMMMMMMMMMH' `M_|M]r\?
|
||||
. `MMMMMMMMM' `$_:`'"H
|
||||
- TMMMMMMMM, '"``::
|
||||
: [MMMMMMMM| oH| .#M-
|
||||
: `9MMMMMM' .MP . ,oMMT
|
||||
. HMMMMP' `' ,MMMP {Name}
|
||||
- `MMH' HH9* {Description}
|
||||
'. ` ` .' {Loaded}
|
||||
- . '
|
||||
` . - .-
|
||||
` . .-
|
||||
' -==pHMMH##HH#"""
|
49
vendor/owner888/phpspider/hacked-emails/hacked_emails.php
vendored
Normal file
49
vendor/owner888/phpspider/hacked-emails/hacked_emails.php
vendored
Normal file
@ -0,0 +1,49 @@
|
||||
<?php
|
||||
// Set the PHP memory limit for this script to 10240M.
ini_set("memory_limit", "10240M");
|
||||
// Load autoloader.php from the directory above this script.
require_once __DIR__ . '/../autoloader.php';
|
||||
use phpspider\core\requests;
|
||||
use phpspider\core\selector;
|
||||
|
||||
/* Do NOT delete this comment */
|
||||
/* Do not delete this comment (originally written in Chinese) */
|
||||
|
||||
// Print the banner, then end the script: nothing after the exit statement runs.
hacked_emails::random_banner();
|
||||
exit;
|
||||
/**
 * Prints the start-up banner of the "Hacked Emails" script.
 */
class hacked_emails
{
    // ANSI escape sequences:
    // green - yellow - blue - red - reset all attributes - magenta - cyan
    public static $color_g = "\033[92m";
    public static $color_y = "\033[93m";
    public static $color_b = "\033[94m";
    public static $color_r = "\033[91m";
    public static $color_w = "\033[0m";
    public static $color_m = "\x1b[35m";
    public static $color_c = "\x1b[36m";
    // Default foreground colour
    public static $end     = "\x1b[39m";
    public static $bold    = "\033[1m";

    /**
     * Print one randomly chosen banner from banners.txt.
     *
     * banners.txt is read from the directory of this script, so the result
     * does not depend on the current working directory. The banners in the
     * file are separated by the line "$$$$$AnyShIt$$$$$$"; the placeholders
     * {Name}, {Description} and {Loaded} are filled in before printing.
     * When the file cannot be read, only the colour codes are printed.
     *
     * @return void
     */
    public static function random_banner()
    {
        $path = __DIR__ . "/banners.txt";
        $content = is_readable($path) ? file_get_contents($path) : false;
        if ($content === false)
        {
            $content = '';
        }

        $banners = explode('$$$$$AnyShIt$$$$$$', $content);
        // Pick any of the banners, not always the last one
        $banner = $banners[array_rand($banners)];

        $banner_to_print  = self::$color_g;
        $banner_to_print .= $banner;
        $banner_to_print .= self::$end;

        $name = self::$color_b."Hacked Emails By ".self::$bold."@seatle -".self::$color_m." V0.1".self::$color_g;
        $banner_to_print = str_replace("{Name}", $name, $banner_to_print);
        $description = self::$color_c."Know the dangers of email credentials reuse attacks.".self::$color_g;
        $banner_to_print = str_replace("{Description}", $description, $banner_to_print);
        $loaded = self::$color_b."Loaded ".self::$color_y."14".self::$color_b." website.".self::$color_g;
        $banner_to_print = str_replace("{Loaded}", $loaded, $banner_to_print);
        echo $banner_to_print;
    }
}
|
||||
|
||||
// NOTE: this code is never reached, because the script calls exit right after
// printing the banner.
// Request the page, select "div.author" from the response using the "css"
// selector type and echo the result.
$html = requests::get('http://www.qiushibaike.com/article/118914171');
|
||||
//echo $html;
|
||||
//exit;
|
||||
$data = selector::select($html, "div.author", "css");
|
||||
echo $data;
|
425
vendor/owner888/phpspider/library/cls_curl.php
vendored
Normal file
425
vendor/owner888/phpspider/library/cls_curl.php
vendored
Normal file
@ -0,0 +1,425 @@
|
||||
<?php
|
||||
/**
|
||||
* Static wrapper around the cURL extension for HTTP GET / POST requests
|
||||
*
|
||||
* Licensed under The MIT License
|
||||
* For full copyright and license information, please see the MIT-LICENSE.txt
|
||||
* Redistributions of files must retain the above copyright notice.
|
||||
*
|
||||
* @author seatle<seatle@foxmail.com>
|
||||
* @copyright seatle<seatle@foxmail.com>
|
||||
* @link http://www.epooll.com/
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
||||
*/
|
||||
|
||||
class cls_curl
|
||||
{
|
||||
protected static $timeout = 10;
|
||||
protected static $ch = null;
|
||||
protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
|
||||
protected static $http_raw = false;
|
||||
protected static $cookie = null;
|
||||
protected static $cookie_jar = null;
|
||||
protected static $cookie_file = null;
|
||||
protected static $referer = null;
|
||||
protected static $ip = null;
|
||||
protected static $proxy = null;
|
||||
protected static $headers = array();
|
||||
protected static $hosts = array();
|
||||
protected static $gzip = false;
|
||||
protected static $info = array();
|
||||
|
||||
/**
|
||||
* set timeout
|
||||
*
|
||||
* @param init $timeout
|
||||
* @return
|
||||
*/
|
||||
public static function set_timeout($timeout)
|
||||
{
|
||||
self::$timeout = $timeout;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置代理
|
||||
*
|
||||
* @param mixed $proxy
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2016-09-18 10:17
|
||||
*/
|
||||
public static function set_proxy($proxy)
|
||||
{
|
||||
self::$proxy = $proxy;
|
||||
}
|
||||
|
||||
/**
|
||||
* set referer
|
||||
*
|
||||
*/
|
||||
public static function set_referer($referer)
|
||||
{
|
||||
self::$referer = $referer;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置 user_agent
|
||||
*
|
||||
* @param string $useragent
|
||||
* @return void
|
||||
*/
|
||||
public static function set_useragent($useragent)
|
||||
{
|
||||
self::$useragent = $useragent;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE
|
||||
*
|
||||
* @param string $cookie
|
||||
* @return void
|
||||
*/
|
||||
public static function set_cookie($cookie)
|
||||
{
|
||||
self::$cookie = $cookie;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE JAR
|
||||
*
|
||||
* @param string $cookie_jar
|
||||
* @return void
|
||||
*/
|
||||
public static function set_cookie_jar($cookie_jar)
|
||||
{
|
||||
self::$cookie_jar = $cookie_jar;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE FILE
|
||||
*
|
||||
* @param string $cookie_file
|
||||
* @return void
|
||||
*/
|
||||
public static function set_cookie_file($cookie_file)
|
||||
{
|
||||
self::$cookie_file = $cookie_file;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取内容的时候是不是连header也一起获取
|
||||
*
|
||||
* @param mixed $http_raw
|
||||
* @return void
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2016-09-18 10:17
|
||||
*/
|
||||
public static function set_http_raw($http_raw)
|
||||
{
|
||||
self::$http_raw = $http_raw;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置IP
|
||||
*
|
||||
* @param string $ip
|
||||
* @return void
|
||||
*/
|
||||
public static function set_ip($ip)
|
||||
{
|
||||
self::$ip = $ip;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置Headers
|
||||
*
|
||||
* @param string $headers
|
||||
* @return void
|
||||
*/
|
||||
public static function set_headers($headers)
|
||||
{
|
||||
self::$headers = $headers;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置Hosts
|
||||
*
|
||||
* @param string $hosts
|
||||
* @return void
|
||||
*/
|
||||
public static function set_hosts($hosts)
|
||||
{
|
||||
self::$hosts = $hosts;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置Gzip
|
||||
*
|
||||
* @param string $hosts
|
||||
* @return void
|
||||
*/
|
||||
public static function set_gzip($gzip)
|
||||
{
|
||||
self::$gzip = $gzip;
|
||||
}
|
||||
|
||||
/**
|
||||
* 初始化 CURL
|
||||
*
|
||||
*/
|
||||
public static function init()
|
||||
{
|
||||
//if (empty ( self::$ch ))
|
||||
if (!is_resource ( self::$ch ))
|
||||
{
|
||||
self::$ch = curl_init ();
|
||||
curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
|
||||
curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout );
|
||||
curl_setopt( self::$ch, CURLOPT_HEADER, false );
|
||||
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
|
||||
curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
|
||||
// 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生
|
||||
curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
|
||||
}
|
||||
return self::$ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* get
|
||||
*
|
||||
*
|
||||
*/
|
||||
public static function get($url, $fields = array())
|
||||
{
|
||||
self::init ();
|
||||
return self::http_request($url, 'get', $fields);
|
||||
}
|
||||
|
||||
/**
|
||||
* $fields 有三种类型:1、数组;2、http query;3、json
|
||||
* 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao'))
|
||||
* 前两种是普通的post,可以用$_POST方式获取
|
||||
* 第三种是post stream( json rpc,其实就是webservice ),虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取
|
||||
*
|
||||
* @param mixed $url
|
||||
* @param array $fields
|
||||
* @param mixed $proxy
|
||||
* @static
|
||||
* @access public
|
||||
* @return void
|
||||
*/
|
||||
public static function post($url, $fields = array())
|
||||
{
|
||||
self::init ();
|
||||
return self::http_request($url, 'post', $fields);
|
||||
}
|
||||
|
||||
public static function http_request($url, $type = 'get', $fields)
|
||||
{
|
||||
// 如果是 get 方式,直接拼凑一个 url 出来
|
||||
if (strtolower($type) == 'get' && !empty($fields))
|
||||
{
|
||||
$url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields);
|
||||
}
|
||||
|
||||
// 随机绑定 hosts,做负载均衡
|
||||
if (self::$hosts)
|
||||
{
|
||||
$parse_url = parse_url($url);
|
||||
$host = $parse_url['host'];
|
||||
$key = rand(0, count(self::$hosts)-1);
|
||||
$ip = self::$hosts[$key];
|
||||
$url = str_replace($host, $ip, $url);
|
||||
self::$headers = array_merge( array('Host:'.$host), self::$headers );
|
||||
}
|
||||
curl_setopt( self::$ch, CURLOPT_URL, $url );
|
||||
// 如果是 post 方式
|
||||
if (strtolower($type) == 'post')
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_POST, true );
|
||||
curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
|
||||
}
|
||||
if (self::$useragent)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
|
||||
}
|
||||
if (self::$cookie)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie );
|
||||
}
|
||||
if (self::$cookie_jar)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar );
|
||||
}
|
||||
if (self::$cookie_file)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file );
|
||||
}
|
||||
if (self::$referer)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer );
|
||||
}
|
||||
if (self::$ip)
|
||||
{
|
||||
self::$headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), self::$headers );
|
||||
}
|
||||
if (self::$headers)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_HTTPHEADER, self::$headers );
|
||||
}
|
||||
if (self::$gzip)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
|
||||
}
|
||||
if (self::$proxy)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy );
|
||||
}
|
||||
if (self::$http_raw)
|
||||
{
|
||||
curl_setopt( self::$ch, CURLOPT_HEADER, true );
|
||||
}
|
||||
|
||||
$data = curl_exec ( self::$ch );
|
||||
self::$info = curl_getinfo(self::$ch);
|
||||
if ($data === false)
|
||||
{
|
||||
//echo date("Y-m-d H:i:s"), ' Curl error: ' . curl_error( self::$ch ), "\n";
|
||||
}
|
||||
|
||||
// 关闭句柄
|
||||
curl_close( self::$ch );
|
||||
//$data = substr($data, 10);
|
||||
//$data = gzinflate($data);
|
||||
return $data;
|
||||
}
|
||||
|
||||
/**
 * Return the transfer info stored by the most recent request.
 *
 * @return mixed Current value of self::$info (the curl_getinfo() result saved after curl_exec()).
 */
public static function get_info()
{
    $last_info = self::$info;

    return $last_info;
}
|
||||
|
||||
/**
 * Return the HTTP status code stored by the most recent request.
 *
 * @return mixed The 'http_code' entry of self::$info, or null when no such entry is stored.
 */
public static function get_http_code()
{
    // self::$info is only filled in after a request has been executed; guard the lookup
    // so an early call yields null without raising an invalid-offset warning.
    return isset(self::$info['http_code']) ? self::$info['http_code'] : null;
}
|
||||
}
|
||||
|
||||
/**
 * Fetch all URLs concurrently with curl_multi and process the responses once
 * every transfer has finished.
 *
 * @param array $urls  URLs to download.
 * @param mixed $delay Forwarded unchanged to callback().
 * @return array Map of URL => return value of callback() for that URL.
 */
function classic_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url)
    {
        // $map is keyed by URL: a repeated URL would replace the earlier handle and leave
        // it attached to the multi handle forever, so each URL is requested only once.
        if (isset($map[$url]))
        {
            continue;
        }

        // Create the cURL handle and set the URL plus the other options.
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);

        // Add the handle to the curl_multi queue.
        curl_multi_add_handle($queue, $ch);
        $map[$url] = $ch;
    }

    $active = null;

    // Drive the transfers until none is active any more or curl reports an error.
    do {
        do {
            $mrc = curl_multi_exec($queue, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // curl_multi_select() may return -1 immediately; without a pause the loop would
        // spin at 100% CPU, so back off briefly in that case.
        if ($active > 0 && $mrc == CURLM_OK && curl_multi_select($queue, 0.5) == -1)
        {
            usleep(1000);
        }
    } while ($active > 0 && $mrc == CURLM_OK);

    $responses = array();
    foreach ($map as $url => $ch) {
        $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
        curl_multi_remove_handle($queue, $ch);
        curl_close($ch);
    }

    curl_multi_close($queue);
    return $responses;
}
|
||||
|
||||
/**
 * Fetch all URLs concurrently with curl_multi and hand each response to callback()
 * as soon as its transfer completes.
 *
 * @param array $urls  URLs to download.
 * @param mixed $delay Forwarded unchanged to callback().
 * @return array Map of URL => array('info' => ..., 'error' => ..., 'results' => ...).
 */
function rolling_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();

    // NOTE(review): a hard-coded session cookie and user agent are sent with every request.
    // They are kept to preserve behaviour; consider moving them to configuration.
    $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
    $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';

    foreach ($urls as $url) {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');

        curl_multi_add_handle($queue, $ch);

        // cURL handles are objects since PHP 8 and cannot be cast to string, so key the
        // map by object id there and by resource id on older versions.
        $map[is_object($ch) ? spl_object_id($ch) : (int) $ch] = $url;
    }

    $responses = array();
    $active = 0;
    do {
        while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;

        if ($code != CURLM_OK) { break; }

        // One or more requests just completed -- find out which.
        while ($done = curl_multi_info_read($queue)) {
            $handle = $done['handle'];
            $url = $map[is_object($handle) ? spl_object_id($handle) : (int) $handle];

            // Collect the info, error text and processed content of the finished request.
            $info = curl_getinfo($handle);
            $error = curl_error($handle);
            $results = callback(curl_multi_getcontent($handle), $delay, $url);
            $responses[$url] = compact('info', 'error', 'results');

            // Detach and release the handle that just completed.
            curl_multi_remove_handle($queue, $handle);
            curl_close($handle);
        }

        // Block until there is socket activity; errors are reported by curl_multi_exec.
        if ($active > 0) {
            curl_multi_select($queue, 0.5);
        }

    } while ($active);

    curl_multi_close($queue);
    return $responses;
}
|
||||
|
||||
/**
 * Persist a downloaded page to ./html2/<md5 of the URL>.html.
 *
 * @param string $data  Response body; nothing is written when it is empty.
 * @param mixed  $delay Unused.
 * @param string $url   URL the body was fetched from; its md5 hash names the file.
 * @return void
 */
function callback($data, $delay, $url) {
    // Nothing to store for an empty response.
    if (empty($data))
    {
        return;
    }

    $target = "./html2/".md5($url).".html";
    file_put_contents($target, $data);
}
|
||||
|
248
vendor/owner888/phpspider/library/cls_query.php
vendored
Normal file
248
vendor/owner888/phpspider/library/cls_query.php
vendored
Normal file
@ -0,0 +1,248 @@
|
||||
<?php
|
||||
/**
 * Small regex-based HTML selector engine.
 *
 * A query is a space-separated chain of parts; each part is a tag name optionally
 * followed by "#id" or ".class" and by one [attr='value'] filter, for example
 * "div#main a.link[href='/topic/1']". Matching is done with regular expressions,
 * not with a DOM parser.
 */
class cls_query
{
    /** @var string HTML the queries run against. */
    private static $content;

    /** @var bool When true, every generated regular expression is echoed. */
    public static $debug = false;

    /**
     * Set the HTML content that later queries operate on.
     *
     * @param string $content
     * @return void
     */
    public static function init($content)
    {
        self::$content = $content;
    }

    /**
     * Run a selector against the stored content.
     *
     * @param string $query Selector chain.
     * @param string $attr  "html" for the inner HTML of the matched nodes, otherwise the
     *                      name of the attribute to read from them.
     * @return array|false|null See get_datas().
     */
    public static function query($query, $attr = "html")
    {
        return self::get_datas(self::get_nodes($query), $attr);
    }

    /**
     * Test whether the given string contains a word character.
     *
     * @param string $char
     * @return mixed Result of mb_eregi() when mbstring is loaded, otherwise of preg_match().
     */
    protected static function is_char($char) {
        return extension_loaded('mbstring') ? mb_eregi('\w', $char) : preg_match('@\w@', $char);
    }

    /**
     * Split a selector chain into node descriptions.
     *
     * @param string $query
     * @return array List of arrays with the keys path, name, id, class and other.
     */
    private static function get_nodes($query)
    {
        // Collapse runs of whitespace into one space and drop the whitespace around
        // ">", "+" and "~" (those combinators are not otherwise interpreted).
        $query = trim(
            preg_replace('@\s+@', ' ',
                preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)
            )
        );

        $nodes = array();
        if (! $query)
        {
            return $nodes;
        }

        $path = '';
        foreach (explode(" ", $query) as $k => $v)
        {
            $path = $k == 0 ? $v : $path.' '.$v;
            $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array());

            // An attribute filter such as [rel='topic'] is split off the selector part.
            if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3]))
            {
                $v = $matches[1];
                $node['other'] = array(
                    'key'=>$matches[2],
                    'val'=>$matches[3],
                );
            }

            $id_arr    = explode("#", $v);
            $class_arr = explode(".", $v);
            if (count($id_arr) === 2)
            {
                // tag#id
                $node['name'] = $id_arr[0];
                $node['id']   = $id_arr[1];
            }
            elseif (count($class_arr) === 2)
            {
                // tag.class
                $node['name']  = $class_arr[0];
                $node['class'] = $class_arr[1];
            }
            else
            {
                // plain tag
                $node['name'] = $v;
            }
            $nodes[] = $node;
        }
        return $nodes;
    }

    /**
     * Resolve a list of node descriptions against the stored content.
     *
     * Each node is searched inside the results of the previous one; only the results
     * of the last node are returned.
     *
     * @param array  $nodes Node list as produced by get_nodes().
     * @param string $attr  "html" or an attribute name (applied to the last node only).
     * @return array|false|null false when no content has been set, null when the node
     *                          list is empty or the first node matches nothing, otherwise
     *                          the list of extracted strings.
     */
    public static function get_datas($nodes, $attr = "html")
    {
        if (empty(self::$content))
        {
            return false;
        }

        $count = count($nodes);
        $current = null;
        foreach ($nodes as $i => $node)
        {
            $is_last = $count == $i + 1;
            if ($i == 0)
            {
                $current = self::get_node_datas($node, self::$content, $attr, $is_last);
                // Nothing matched the first node: there is nothing to descend into.
                if (!$current)
                {
                    return null;
                }
                continue;
            }

            // Search this node inside every fragment produced by the previous node. Only
            // the latest level is kept, so earlier fragments can be freed.
            $next = array();
            foreach ($current as $fragment)
            {
                $next = array_merge($next, self::get_node_datas($node, trim($fragment), $attr, $is_last));
            }
            $current = $next;
        }
        return $current;
    }

    /**
     * Extract the matches of one node from an HTML fragment.
     *
     * Selector parts are escaped with preg_quote() before being placed into the
     * patterns, so characters such as "?" or "." in an attribute value are matched
     * literally.
     *
     * @param array  $node    One entry of get_nodes().
     * @param string $content HTML fragment to search.
     * @param string $attr    "html" or an attribute name.
     * @param bool   $is_last Whether this is the last node of the chain; the attribute
     *                        is only read for the last node.
     * @return array List of trimmed strings.
     */
    private static function get_node_datas($node, $content, $attr = "html", $is_last = false)
    {
        $node_datas = array();
        $name = preg_quote($node['name'], '@');

        if (!empty($node['id']))
        {
            $id = preg_quote($node['id'], '@');
            if ($node['name'])
                $regex = '@<'.$name.'[^>]+id\\s*=\\s*["|\']+?'.$id.'\\s*[^>]+?>(.*?)</'.$name.'>@is';
            else
                $regex = '@id\\s*=\\s*["|\']+?'.$id.'\\s*[^>]+?>(.*?)<@is';
        }
        elseif (!empty($node['class']))
        {
            $class = preg_quote($node['class'], '@');
            if ($node['name'])
                $regex = '@<'.$name.'[^>]+class\\s*=\\s*["|\']+?'.$class.'\\s*[^>]+?>(.*?)</'.$name.'>@is';
            else
                $regex = '@class\\s*=\\s*["|\']+?'.$class.'\\s*[^>]+?>(.*?)<@is';
        }
        else
        {
            // "*" (zero or more) after the tag name because the tag may have no attributes, e.g. <li>.
            $regex = '@<'.$name.'[^>]*?>(.*?)</'.$name.'>@is';
        }
        self::log("regex --- " . $regex);
        preg_match_all($regex, $content, $matches);
        $all_datas  = empty($matches[0]) ? array() : $matches[0];
        $html_datas = empty($matches[1]) ? array() : $matches[1];

        foreach ($all_datas as $i => $data)
        {
            // Drop matches that do not satisfy the [attr='value'] filter, if one was given.
            if (!empty($node['other']))
            {
                $regex = '@'.preg_quote($node['other']['key'], '@').'=[\'|"]'.preg_quote($node['other']['val'], '@').'[\'|"]@is';
                self::log("regex other --- " . $regex);
                if (!preg_match($regex, $data))
                {
                    continue;
                }
            }

            if ($attr != "html" && $is_last)
            {
                // Read the requested attribute from the matched markup.
                $regex = '@'.preg_quote($attr, '@').'=[\'|"](.*?)[\'|"]@is';
                preg_match($regex, $data, $attr_match);
                $node_datas[] = empty($attr_match[1]) ? '' : trim($attr_match[1]);
            }
            else
            {
                // Keep the inner HTML of the node.
                $node_datas[] = trim($html_datas[$i]);
            }
        }
        return $node_datas;
    }

    /**
     * Echo a timestamped message when debugging is enabled.
     *
     * @param string $msg
     * @return void
     */
    private static function log($msg)
    {
        $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n";
        if (self::$debug)
        {
            echo $msg;
        }
    }

}
|
||||
|
||||
//$xpath = "ul.top-nav-dropdown li";
|
||||
//$xpath = "i.zg-icon";
|
||||
//print_r($nodes);
|
||||
//exit;
|
||||
// [^>]+ 不是>的字符重复一次到多次, ? 表示不贪婪
|
||||
// \s 表示空白字符
|
||||
// * 表示0次或者多次
|
||||
// + 表示1次或者多次
|
||||
//
|
||||
// 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。
|
||||
// \\0 表示整个表达式
|
||||
// \\1表示第1个表达式
|
||||
// \\2表示第2个表达式
|
||||
// $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
|
||||
//preg_match_all($regex, $content, $matches);
|
||||
//print_r($matches);
|
||||
//exit;
|
||||
|
||||
// 用法
|
||||
//$content = file_get_contents("./test.html");
|
||||
//$query = "ul#top-nav-profile-dropdown li a";
|
||||
//$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']";
|
||||
//cls_query::init($content);
|
||||
//$list = cls_query::query($query, "href");
|
||||
//print_r($list);
|
||||
|
1263
vendor/owner888/phpspider/library/cls_redis.php
vendored
Normal file
1263
vendor/owner888/phpspider/library/cls_redis.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
121
vendor/owner888/phpspider/library/cls_redis_client.php
vendored
Normal file
121
vendor/owner888/phpspider/library/cls_redis_client.php
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
<?php
|
||||
/**
 * Minimal Redis client speaking the Redis wire protocol over a TCP stream.
 * For the protocol see http://redis.cn/topics/protocol.html
 *
 * Every method call is sent as a command through __call(), e.g. $client->set('k', 'v').
 *
 * @version 2.7.0
 * @copyright 1997-2018 The PHP Group
 * @author seatle <seatle@foxmail.com>
 * @created time :2018-01-03
 */
class cls_redis_client
{
    /** @var resource|false Connected socket stream, false when not connected. */
    private $redis_socket = false;

    /**
     * Connect to the server.
     *
     * @param string $host
     * @param int    $port
     * @param float  $timeout Connection timeout in seconds.
     * @throws Exception When the connection cannot be established.
     */
    public function __construct($host='127.0.0.1', $port=6379, $timeout = 3)
    {
        $this->redis_socket = stream_socket_client("tcp://".$host.":".$port, $errno, $errstr, $timeout);
        if ( !$this->redis_socket )
        {
            throw new Exception("{$errno} - {$errstr}");
        }
    }

    /**
     * Close the connection if one was opened.
     */
    public function __destruct()
    {
        // The socket is still false when the constructor failed; fclose() must not see that.
        if (is_resource($this->redis_socket))
        {
            fclose($this->redis_socket);
        }
    }

    /**
     * Send the called method name and its arguments as one command and return the reply.
     *
     * @param string $name Command name.
     * @param array  $args Command arguments.
     * @return mixed Decoded reply, see read_response().
     * @throws Exception When the command cannot be written or the reply is an error.
     */
    public function __call($name, $args)
    {
        $crlf = "\r\n";
        array_unshift($args, $name);

        // Encode as an array of bulk strings: *<count> followed by $<bytes> <payload> per item.
        $command = '*' . count($args) . $crlf;
        foreach ($args as $arg)
        {
            $arg = (string) $arg;
            $command .= '$' . strlen($arg) . $crlf . $arg . $crlf;
        }

        // fwrite() on a socket may accept only part of the buffer, so keep writing the rest.
        $pending = $command;
        while ($pending !== '')
        {
            $written = fwrite($this->redis_socket, $pending);
            if ($written === FALSE || $written <= 0)
            {
                throw new Exception('Failed to write entire command to stream');
            }
            $pending = (string) substr($pending, $written);
        }
        return $this->read_response();
    }

    /**
     * Read and decode one reply from the socket.
     *
     * @return mixed TRUE for "+OK", string for other status replies and bulk replies,
     *               int for integer replies, array for multi-bulk replies, NULL for
     *               nil bulk / nil multi-bulk replies.
     * @throws Exception On error replies, unknown reply types and read failures.
     */
    private function read_response()
    {
        $line = fgets($this->redis_socket);
        if ($line === FALSE)
        {
            // Connection closed or timed out before a reply line arrived.
            throw new Exception('Failed to read response from stream');
        }
        $reply = trim($line);

        switch (substr($reply, 0, 1))
        {
            /* Error reply */
            case '-':
                throw new Exception(trim(substr($reply, 1)));

            /* Status reply */
            case '+':
                $response = substr($reply, 1);
                if ($response === 'OK')
                {
                    $response = TRUE;
                }
                break;

            /* Bulk reply */
            case '$':
                if ($reply == '$-1')
                {
                    // Nil reply.
                    $response = NULL;
                    break;
                }
                $size = intval(substr($reply, 1));
                // A zero-length bulk reply is an empty string, not nil.
                $response = '';
                while (strlen($response) < $size)
                {
                    $remaining = $size - strlen($response);
                    $chunk = fread($this->redis_socket, $remaining > 1024 ? 1024 : $remaining);
                    // An empty read means the peer went away; stop instead of looping forever.
                    if ($chunk === FALSE || $chunk === '')
                    {
                        throw new Exception('Failed to read response from stream');
                    }
                    $response .= $chunk;
                }
                fread($this->redis_socket, 2); /* discard crlf */
                break;

            /* Multi-bulk reply */
            case '*':
                $count = intval(substr($reply, 1));
                if ($count == '-1')
                {
                    return NULL;
                }
                $response = array();
                for ($i = 0; $i < $count; $i++)
                {
                    $response[] = $this->read_response();
                }
                break;

            /* Integer reply */
            case ':':
                $response = intval(substr($reply, 1));
                break;

            default:
                // Plain Exception: this file defines no RedisException class.
                throw new Exception("Unknown response: {$reply}");
        }
        return $response;
    }
}
|
||||
|
||||
|
||||
//$redis = new cls_redis_client();
|
||||
//var_dump($redis->auth("foobared"));
|
||||
//var_dump($redis->set("name",'abc'));
|
||||
//var_dump($redis->get("name"));
|
||||
|
179
vendor/owner888/phpspider/library/cls_redis_server.php
vendored
Normal file
179
vendor/owner888/phpspider/library/cls_redis_server.php
vendored
Normal file
@ -0,0 +1,179 @@
|
||||
<?php
|
||||
ini_set("memory_limit", "128M");
|
||||
/**
 * Toy Redis-protocol server.
 * Multi-process, blocking: each forked worker accepts and serves one connection at a time.
 * Benchmark example: redis-benchmark -h 127.0.0.1 -p 11211 -t set -n 80000 -q
 *
 * @version 2.7.0
 * @copyright 1997-2018 The PHP Group
 * @author seatle <seatle@foxmail.com>
 * @created time :2018-01-03
 */
class cls_redis_server
{
    /** @var resource|false Listening socket. */
    private $socket = false;

    /** @var int Number of worker processes to keep running. */
    private $process_num = 3;

    /** @var array Key/value store used by the message handler. */
    public $redis_kv_data = array();

    /** @var callable|null Called as ($conn, $message) for every parsed request. */
    public $onMessage = null;

    /** @var array Worker process ids, keyed by pid. */
    public $pids = array();

    /**
     * Open the listening socket; terminates the script when that fails.
     *
     * @param string $host
     * @param int    $port
     */
    public function __construct($host="0.0.0.0", $port=6379)
    {
        $this->socket = stream_socket_server("tcp://".$host.":".$port,$errno, $errstr);
        if (!$this->socket) die($errstr."--".$errno);
        echo "listen $host $port \r\n";
    }

    /**
     * Read one protocol value from the connection.
     *
     * All lengths are byte counts, so only byte-wise string functions are used here.
     *
     * @param resource $conn Client stream.
     * @return array|string|null Array for "*" requests, string for "$" bulk strings and
     *                           other lines, null for a nil bulk string or when nothing
     *                           can be read.
     * @throws Exception When the stream ends in the middle of a bulk string.
     */
    private function parse_resp(&$conn)
    {
        // One line, terminated by \r\n.
        $line = fgets($conn);
        if ($line === '' || $line === false)
        {
            return null;
        }

        // The first byte is the type marker; the rest, without the line ending, is the payload.
        $type = $line[0];
        $line = rtrim((string) substr($line, 1), "\r\n");

        switch ( $type )
        {
            case "*":
                // Element count followed by that many values.
                $count = (int) $line;
                $data = array();
                for ($i = 1; $i <= $count; $i++)
                {
                    $data[] = $this->parse_resp($conn);
                }
                return $data;

            case "$":
                if ($line == '-1')
                {
                    return null;
                }
                // The payload is followed by \r\n, so two extra bytes have to be consumed.
                $length = (int) $line + 2;
                $data = '';
                while (strlen($data) < $length)
                {
                    // A socket read may deliver fewer bytes than requested; keep reading
                    // until the value is complete and fail only when the stream ends.
                    $block = fread($conn, $length - strlen($data));
                    if ($block === false || $block === '')
                    {
                        throw new Exception('RECEIVING');
                    }
                    $data .= $block;
                }
                return (string) substr($data, 0, -2);
        }
        return $line;
    }

    /**
     * Worker main loop: accept connections and dispatch every parsed request to onMessage.
     * Never returns.
     *
     * @return void
     */
    private function serve_connections()
    {
        while ( true )
        {
            echo "PID ".posix_getpid()." waiting...\n";
            // Block until a client connects.
            $conn = stream_socket_accept($this->socket, -1);
            if ( !$conn )
            {
                continue;
            }

            // Example request: "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
            while ( true )
            {
                $arr = $this->parse_resp($conn);
                if ( !is_array($arr) && !$arr )
                {
                    // Nothing (more) to read: drop the connection and wait for the next one.
                    fclose($conn);
                    break;
                }
                if ($this->onMessage)
                {
                    call_user_func($this->onMessage, $conn, $arr);
                }
            }
        }
    }

    /**
     * Fork one worker; the parent records its pid, the child serves connections forever.
     *
     * @return void
     */
    private function start_worker_process()
    {
        $pid = pcntl_fork();
        if ($pid == -1)
        {
            echo "fork error : " . pcntl_strerror(pcntl_get_last_error()) . " \r\n";
            exit;
        }
        if ($pid == 0)
        {
            $this->serve_connections();
            // Not reached (the loop above never ends); guards against running parent code in the child.
            exit(0);
        }
        $this->pids[$pid] = $pid;
    }

    /**
     * Start the workers and restart any worker that exits. Never returns.
     *
     * @return void
     */
    public function run()
    {
        for ($n = 1; $n <= $this->process_num; $n++)
        {
            $this->start_worker_process();
        }

        while( true )
        {
            foreach ($this->pids as $pid)
            {
                if (!$pid)
                {
                    continue;
                }
                $res = pcntl_waitpid($pid, $status, WNOHANG);
                if ( $res == -1 || $res > 0 )
                {
                    // Forget the dead worker before forking: the replacement may be given
                    // the same pid and must stay registered.
                    unset($this->pids[$pid]);
                    $this->start_worker_process();
                }
            }
            sleep(1);
        }
    }

}
|
||||
|
||||
// Start a server on the default address with a small in-memory SET/GET handler.
$server = new cls_redis_server();
$server->onMessage = function($conn, $info) use($server)
{
    // Anything that is not a command array is simply acknowledged.
    if ( !is_array($info) || !isset($info[0]) )
    {
        fwrite($conn,"+OK\r\n");
        return;
    }

    $command = strtoupper($info[0]);
    if ( $command == "SET" )
    {
        if ( !isset($info[1], $info[2]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'set' command\r\n");
            return;
        }
        $server->redis_kv_data[$info[1]] = $info[2];
        fwrite($conn, "+OK\r\n");
    }
    else if ( $command == "GET" )
    {
        if ( !isset($info[1]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'get' command\r\n");
            return;
        }
        $key = $info[1];
        if ( isset($server->redis_kv_data[$key]) )
        {
            $val = $server->redis_kv_data[$key];
            fwrite($conn, "$".strlen($val)."\r\n".$val."\r\n");
        }
        else
        {
            // A missing key is answered with a nil bulk reply, not with an empty string.
            fwrite($conn, "$-1\r\n");
        }
    }
    else
    {
        // Every other command is acknowledged without doing anything.
        fwrite($conn,"+OK\r\n");
    }
};
$server->run();
|
5727
vendor/owner888/phpspider/library/phpquery.php
vendored
Normal file
5727
vendor/owner888/phpspider/library/phpquery.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
466
vendor/owner888/phpspider/library/rolling_curl.php
vendored
Normal file
466
vendor/owner888/phpspider/library/rolling_curl.php
vendored
Normal file
@ -0,0 +1,466 @@
|
||||
<?php
|
||||
/**
|
||||
* Curl操作类
|
||||
*
|
||||
* Licensed under The MIT License
|
||||
* For full copyright and license information, please see the MIT-LICENSE.txt
|
||||
* Redistributions of files must retain the above copyright notice.
|
||||
*
|
||||
* @author seatle<seatle@foxmail.com>
|
||||
* @copyright seatle<seatle@foxmail.com>
|
||||
* @link http://www.epooll.com/
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
||||
*/
|
||||
|
||||
/**
 * Queue HTTP requests and run them with a bounded number of concurrent cURL transfers.
 *
 * Requests are queued with get() / post() / request() and run by execute(). For every
 * finished request the callback is invoked as ($output, $info, $request, $error).
 */
class rolling_curl
{
    /**
     * @var int
     *
     * Number of transfers running at the same time.
     * Example: with 8 requests and a window of 5, five start immediately and the
     * remaining three start as earlier ones finish.
     */
    public $window_size = 5;

    /**
     * @var float
     *
     * Timeout is the timeout used for curl_multi_select.
     */
    private $timeout = 10;

    /**
     * @var string|array
     *
     * Callback applied to every finished request.
     */
    public $callback;

    /**
     * @var array
     *
     * Default cURL options for every request.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        // TIMEOUT covers connecting plus transferring, so it must be larger than
        // CONNECTTIMEOUT or the latter has no effect.
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 60,
        CURLOPT_HEADER => 0,
        // Skip signal handlers when timeouts are used.
        CURLOPT_NOSIGNAL => 1,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
    );

    /**
     * @var array Default headers as name => value.
     */
    private $headers = array();

    /**
     * @var array Request queue; each entry has the keys url, method, fields, headers, options.
     */
    private $requests = array();

    /**
     * @var array Maps the key of an in-flight handle to its index in $requests.
     */
    private $requestMap = array();

    // Kept explicit: without __construct(), PHP versions before 8 would treat the
    // method named like the class (rolling_curl) as the constructor.
    public function __construct()
    {
    }

    /**
     * Set the total timeout of a request.
     *
     * @param int $timeout Seconds.
     * @return void
     */
    public function set_timeout($timeout)
    {
        $this->options[CURLOPT_TIMEOUT] = $timeout;
    }

    /**
     * Set the proxy.
     *
     * @param string $proxy
     * @return void
     */
    public function set_proxy($proxy)
    {
        $this->options[CURLOPT_PROXY] = $proxy;
    }

    /**
     * Set the referer.
     *
     * @param string $referer
     * @return void
     */
    public function set_referer($referer)
    {
        $this->options[CURLOPT_REFERER] = $referer;
    }

    /**
     * Set the user agent.
     *
     * @param string $useragent
     * @return void
     */
    public function set_useragent($useragent)
    {
        $this->options[CURLOPT_USERAGENT] = $useragent;
    }

    /**
     * Set the cookie string.
     *
     * @param string $cookie
     * @return void
     */
    public function set_cookie($cookie)
    {
        $this->options[CURLOPT_COOKIE] = $cookie;
    }

    /**
     * Set the cookie jar file.
     *
     * @param string $cookiejar
     * @return void
     */
    public function set_cookiejar($cookiejar)
    {
        $this->options[CURLOPT_COOKIEJAR] = $cookiejar;
    }

    /**
     * Set the cookie file.
     *
     * @param string $cookiefile
     * @return void
     */
    public function set_cookiefile($cookiefile)
    {
        $this->options[CURLOPT_COOKIEFILE] = $cookiefile;
    }

    /**
     * Choose whether the response headers are returned together with the body.
     *
     * @param mixed $http_raw
     * @return void
     */
    public function set_http_raw($http_raw = false)
    {
        $this->options[CURLOPT_HEADER] = $http_raw;
    }

    /**
     * Send the given IP in the CLIENT-IP and X-FORWARDED-FOR headers.
     *
     * @param string $ip
     * @return void
     */
    public function set_ip($ip)
    {
        $this->set_headers(array(
            'CLIENT-IP'=>$ip,
            'X-FORWARDED-FOR'=>$ip,
        ));
    }

    /**
     * Add default headers; a header set again replaces its previous value.
     *
     * @param array $headers Header name => value.
     * @return void
     */
    public function set_headers($headers)
    {
        $this->headers = array_merge($this->headers, $headers);
    }

    /**
     * Set the Host header.
     *
     * @param string $hosts
     * @return void
     */
    public function set_hosts($hosts)
    {
        $this->set_headers(array(
            'Host'=>$hosts,
        ));
    }

    /**
     * Enable gzip content encoding.
     *
     * @param mixed $gzip Truthy to enable; a falsy value leaves the option untouched.
     * @return void
     */
    public function set_gzip($gzip)
    {
        if ($gzip)
        {
            $this->options[CURLOPT_ENCODING] = 'gzip';
        }
    }

    /**
     * Queue a request.
     *
     * @param string $url
     * @param string $method  "GET" or "POST" (case-insensitive).
     * @param mixed  $fields  Query parameters for GET, post fields for POST.
     * @param array  $headers Header name => value, overriding the defaults.
     * @param array  $options cURL options, overriding the defaults.
     * @return bool Always true.
     */
    public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
    {
        $this->requests[] = array('url'=>$url,'method'=>$method,'fields'=>$fields,'headers'=>$headers,'options'=>$options);
        return true;
    }

    /**
     * Build the cURL option array for one queued request.
     *
     * @param array $request Entry as created by request().
     * @return array Options suitable for curl_setopt_array().
     */
    public function get_options($request)
    {
        $options = $this->options;
        $headers = $this->headers;

        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
        {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }

        $method = strtolower($request['method']);
        $url = $request['url'];

        // GET: append the fields to the URL, respecting a query string that is already there.
        if ($method == 'get' && !empty($request['fields']))
        {
            $query = is_array($request['fields']) ? http_build_query($request['fields']) : (string) $request['fields'];
            $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
        }
        // POST: send the fields as the request body.
        if ($method == 'post')
        {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request['fields'];
        }

        // Options and headers of this specific request win over the defaults.
        if ($request['options'])
        {
            $options = $request['options'] + $options;
        }
        if ($request['headers'])
        {
            $headers = $request['headers'] + $headers;
        }

        // cURL expects the headers as a list of "Name:value" strings.
        $header_lines = array();
        foreach ($headers as $k=>$v)
        {
            $header_lines[] = $k.":".$v;
        }

        $options[CURLOPT_URL] = $url;
        $options[CURLOPT_HTTPHEADER] = $header_lines;

        return $options;
    }

    /**
     * Queue a GET request.
     *
     * @param string $url
     * @param mixed  $fields
     * @param array  $headers
     * @param array  $options
     * @return bool
     */
    public function get($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'get', $fields, $headers, $options);
    }

    /**
     * Queue a POST request.
     *
     * $fields may be an array, an http query string or a JSON string; the first two
     * arrive as ordinary form posts, a JSON string is sent as the raw request body.
     *
     * @param string $url
     * @param mixed  $fields
     * @param array  $headers
     * @param array  $options
     * @return bool
     */
    public function post($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'post', $fields, $headers, $options);
    }

    /**
     * Run all queued requests.
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool false when the queue is empty; for a single request without a
     *                     callback the curl_exec() result; otherwise true.
     */
    public function execute($window_size = null)
    {
        $count = count($this->requests);
        if ($count == 0)
        {
            return false;
        }
        if ($count == 1)
        {
            return $this->single_curl();
        }
        return $this->rolling_curl($window_size);
    }

    /**
     * Key identifying a cURL handle in $requestMap.
     *
     * cURL handles are objects since PHP 8 and cannot be cast to string; older versions
     * use resources, whose integer id is unique.
     *
     * @param mixed $handle
     * @return int
     */
    private function handle_key($handle)
    {
        return is_object($handle) ? spl_object_id($handle) : (int) $handle;
    }

    /**
     * Release a cURL handle. Object handles (PHP 8+) are freed when unreferenced, so
     * only resource handles need an explicit curl_close().
     *
     * @param mixed $handle
     * @return void
     */
    private function close_handle($handle)
    {
        if (is_resource($handle))
        {
            curl_close($handle);
        }
    }

    /**
     * Create a handle for queued request number $index and attach it to the multi handle.
     *
     * @param mixed $master curl_multi handle.
     * @param int   $index  Index into $this->requests.
     * @return void
     */
    private function add_request($master, $index)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($master, $ch);
        $this->requestMap[$this->handle_key($ch)] = $index;
    }

    /**
     * Run the only queued request with a plain cURL handle.
     *
     * @return mixed curl_exec() result when no callback is set, otherwise true.
     */
    private function single_curl()
    {
        $ch = curl_init();
        // Take the request off the queue.
        $request = array_shift($this->requests);
        curl_setopt_array($ch, $this->get_options($request));
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = ($output === false) ? curl_error($ch) : null;
        $this->close_handle($ch);

        // Without a callback the body is handed straight back to the caller.
        if (!$this->callback)
        {
            return $output;
        }
        if (is_callable($this->callback))
        {
            call_user_func($this->callback, $output, $info, $request, $error);
        }
        return true;
    }

    /**
     * Run the queued requests, keeping up to window_size transfers in flight.
     *
     * @param int|null $window_size Overrides $this->window_size when given.
     * @return bool true
     */
    private function rolling_curl($window_size = null)
    {
        if ($window_size)
            $this->window_size = $window_size;

        // Never open more transfers than there are requests. A local value is used so a
        // small batch does not permanently shrink the configured window.
        $window = min($this->window_size, count($this->requests));

        // Fewer than two parallel transfers is a job for single_curl().
        if ($window < 2)
            exit("Window size must be greater than 1");

        $master = curl_multi_init();
        $this->requestMap = array();

        // Start the first batch.
        for ($next = 0; $next < $window; $next++)
        {
            $this->add_request($master, $next);
        }

        $running = 0;
        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;

            if ($execrun != CURLM_OK) { break; }

            // Handle every transfer that has finished.
            while ($done = curl_multi_info_read($master))
            {
                $handle = $done['handle'];
                $info   = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);
                $error  = curl_error($handle);

                $key = $this->handle_key($handle);
                $request = isset($this->requestMap[$key]) ? $this->requests[$this->requestMap[$key]] : null;
                unset($this->requestMap[$key]);

                if (is_callable($this->callback))
                {
                    call_user_func($this->callback, $output, $info, $request, $error);
                }

                // A slot became free: start the next queued request, if any.
                if (isset($this->requests[$next]))
                {
                    $this->add_request($master, $next);
                    $next++;
                }

                // Detach and release the finished handle.
                curl_multi_remove_handle($master, $handle);
                $this->close_handle($handle);
            }

            // Block while nothing is happening instead of spinning at 100% CPU.
            if ($running)
            {
                curl_multi_select($master, $this->timeout);
            }

            // $running stems from the curl_multi_exec() call above and does not include
            // handles added afterwards; requestMap tracks those, so keep going until it
            // is empty as well.
        } while ($running || !empty($this->requestMap));

        curl_multi_close($master);

        // Empty the queue so a later batch on the same object does not repeat these
        // URLs; the properties stay arrays so the object remains usable.
        $this->requests = array();
        $this->requestMap = array();
        return true;
    }

    /**
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}
|
7
vendor/owner888/phpspider/test.php
vendored
Normal file
7
vendor/owner888/phpspider/test.php
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
<?php
|
||||
|
||||
// array_filter() without a callback removes the empty-string entries; print what is left.
$arr = array_filter(array('fff', 'ggg', '', ''));
print_r($arr);
|
||||
|
||||
|
32
vendor/owner888/phpspider/worker.php
vendored
Normal file
32
vendor/owner888/phpspider/worker.php
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
<?php
|
||||
|
||||
echo "Starting\n";

// Register this process as a Gearman worker that serves the "reverse" function.
$gmworker = new GearmanWorker();
$gmworker->addServer('10.10.10.238');
$gmworker->addFunction("reverse", "reverse_fn");

print "Waiting for job...\n";

// Keep taking jobs until work() fails or reports a non-success return code.
while (true)
{
    if (!$gmworker->work())
    {
        break;
    }
    if ($gmworker->returnCode() == GEARMAN_SUCCESS)
    {
        continue;
    }
    echo "return_code: " . $gmworker->returnCode() . "\n";
    break;
}
|
||||
|
||||
/**
 * Gearman job handler: print the workload and return it reversed.
 *
 * @param mixed $job Job object providing workload().
 * @return string The workload reversed byte-wise with strrev().
 */
function reverse_fn($job)
{
    // Pause for three seconds before handling the job.
    sleep(3);

    echo $job->workload(), "\n";

    $reversed = strrev($job->workload());
    return $reversed;
}
|
||||
|
||||
|
||||
// Reached only once the worker loop has ended.
print "hello\n";
?>
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user