IYUU自动辅种工具初始化版本

This commit is contained in:
iyuu.cn
2019-12-22 00:00:00 +08:00
commit e81d66869a
133 changed files with 33246 additions and 0 deletions

52
vendor/owner888/phpspider/README.md vendored Normal file
View File

@ -0,0 +1,52 @@
# phpspider -- PHP蜘蛛爬虫框架
《我用爬虫一天时间“偷了”知乎一百万用户,只为证明PHP是世界上最好的语言》所使用的程序
phpspider是一个爬虫开发框架。使用本框架,你不用了解爬虫的底层技术实现,也不必操心爬虫被网站屏蔽、有些网站需要登录或验证码识别才能爬取等问题。简单几行PHP代码,就可以创建自己的爬虫;利用框架封装的多进程Worker类库,代码更简洁,执行效率更高,速度更快。
demo目录下有一些特定网站的爬取规则,只要你安装了PHP环境,代码就可以在命令行下直接跑。对爬虫感兴趣的开发者可以加QQ群一起讨论:147824717。
下面以糗事百科为例, 来看一下我们的爬虫长什么样子:
```
$configs = array(
'name' => '糗事百科',
'domains' => array(
'qiushibaike.com',
'www.qiushibaike.com'
),
'scan_urls' => array(
'http://www.qiushibaike.com/'
),
'content_url_regexes' => array(
"http://www.qiushibaike.com/article/\d+"
),
'list_url_regexes' => array(
"http://www.qiushibaike.com/8hr/page/\d+\?s=\d+"
),
'fields' => array(
array(
// 抽取内容页的文章内容
'name' => "article_content",
'selector' => "//*[@id='single-next-link']",
'required' => true
),
array(
// 抽取内容页的文章作者
'name' => "article_author",
'selector' => "//div[contains(@class,'author')]//h2",
'required' => true
),
),
);
$spider = new phpspider($configs);
$spider->start();
```
爬虫的整体框架就是这样, 首先定义了一个$configs数组, 里面设置了待爬网站的一些信息, 然后通过调用```$spider = new phpspider($configs);```和```$spider->start();```来配置并启动爬虫.
#### 运行界面如下:
![](http://www.epooll.com/zhihu/pachong.gif)
更多详细内容,移步到:
[开发文档](http://doc.phpspider.org)

View File

@ -0,0 +1,77 @@
<?php
/**
* This file is part of phpspider.
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.phpspider.org/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
namespace phpspider;
/**
* autoloader.
*/
class autoloader
{
    /**
     * Base directory searched first for classes outside the phpspider namespace.
     *
     * @var string
     */
    protected static $_autoload_root_path = '';

    /**
     * Remember the directory used as the first lookup location for
     * non-phpspider classes.
     *
     * @param string $root_path
     * @return void
     */
    public static function set_root_path($root_path)
    {
        self::$_autoload_root_path = $root_path;
    }

    /**
     * Map a fully-qualified class name to a file and include it.
     *
     * Names starting with "phpspider\" are resolved relative to this file's
     * directory (the "phpspider" prefix is stripped). Any other name is looked
     * up under the configured root path, falling back to the parent directory
     * of this file.
     *
     * @param string $name Fully-qualified class name.
     * @return boolean True when the file was found and it declared the class.
     */
    public static function load_by_namespace($name)
    {
        $relative = str_replace('\\', DIRECTORY_SEPARATOR, $name);

        if (strpos($name, 'phpspider\\') === 0)
        {
            // Framework class: drop the leading "phpspider" segment.
            $candidate = __DIR__ . substr($relative, strlen('phpspider')) . '.php';
        }
        else
        {
            $candidate = '';
            if (self::$_autoload_root_path)
            {
                $candidate = self::$_autoload_root_path . DIRECTORY_SEPARATOR . $relative . '.php';
            }
            // No root path configured, or the file is not there: try one level up.
            if (empty($candidate) || !is_file($candidate))
            {
                $candidate = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $relative . '.php';
            }
        }

        if (!is_file($candidate))
        {
            return false;
        }

        require_once($candidate);

        // Do not trigger autoloading again while checking the result.
        return class_exists($name, false);
    }
}
spl_autoload_register('\phpspider\autoloader::load_by_namespace');

38
vendor/owner888/phpspider/composer.json vendored Normal file
View File

@ -0,0 +1,38 @@
{
"name": "owner888/phpspider",
"type": "library",
"keywords": [
"framework",
"phpspider"
],
"homepage": "http://www.phpspider.org",
"license": "MIT",
"description": "The PHPSpider Framework.",
"authors": [
{
"name": "Seatle Yang",
"email": "seatle@foxmail.com",
"homepage": "http://www.phpspider.org",
"role": "Developer"
}
],
"support": {
"email": "seatle@foxmail.com",
"issues": "https://github.com/owner888/phpspider/issues",
"forum": "http://wenda.phpspider.org/",
"wiki": "http://doc.phpspider.org/",
"source": "https://github.com/owner888/phpspider"
},
"require": {
"php": ">=5.5.0"
},
"suggest": {
"ext-pcntl": "For better performance.",
"ext-redis": "For better performance."
},
"autoload": {
"psr-4": {
"phpspider\\": "./"
}
},
"minimum-stability": "dev"
}

View File

@ -0,0 +1,64 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider缓存类文件
//----------------------------------
class cache
{
    // A singleton must not be used here: the framework runs multiple processes.
    //protected static $_instance;

    /**
     * Create a new Redis client configured from $GLOBALS['config']['redis'].
     *
     * Reads the keys host, port, timeout, pass (optional) and prefix.
     * Every failure is logged through log::add() and reported by returning null.
     *
     * @return Redis|null Connected client, or null when the extension is
     *                    missing, the connection fails or authentication fails.
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-04-10 22:55
     */
    public static function init()
    {
        if (!extension_loaded('Redis'))
        {
            $errmsg = "extension redis is not installed";
            log::add($errmsg, "Error");
            return null;
        }

        $_instance = new Redis();
        $redis_config = $GLOBALS['config']['redis'];

        // pconnect cannot be used here; it fails with:
        // Uncaught exception 'RedisException' with message 'read error on connection'
        $connected = $_instance->connect($redis_config['host'], $redis_config['port'], $redis_config['timeout']);
        if ($connected === false)
        {
            // Without this check a failed connect would only surface later
            // inside auth()/setOption().
            $errmsg = "Redis Server connect failed!!";
            log::add($errmsg, "Error");
            return null;
        }

        // Authenticate only when a password is configured.
        if (!empty($redis_config['pass']))
        {
            if ( !$_instance->auth($redis_config['pass']) )
            {
                $errmsg = "Redis Server authentication failed!!";
                log::add($errmsg, "Error");
                return null;
            }
        }

        // Original author's note: no serializer is enabled on purpose. Without one
        // arrays cannot be stored directly, but PHP serialization is unreadable from
        // other languages, so callers JSON-encode values themselves.
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_NONE); // don't serialize data
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_PHP); // use built-in serialize/unserialize
        //$_instance->setOption(Redis::OPT_SERIALIZER, Redis::SERIALIZER_IGBINARY); // use igBinary serialize/unserialize
        $_instance->setOption(Redis::OPT_PREFIX, $redis_config['prefix'] . ":");

        return $_instance;
    }
}

View File

@ -0,0 +1,55 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider公共入口文件
//----------------------------------
//namespace phpspider\core;
// Display errors.
ini_set('display_errors', 'on');
// Reporting all.
error_reporting(E_ALL);
// Never time out.
ini_set('max_execution_time', 0);
set_time_limit(0);
// Memory limit: raise it to 1024M only when the current limit is lower.
// The ini value may be "-1" (unlimited) or use K/M/G shorthand, so it has to be
// converted to bytes first; intval() alone would read "2G" as 2 and "-1" as
// "too small" and would LOWER the limit.
call_user_func(function () {
    $limit = trim((string) ini_get('memory_limit'));
    $bytes = (int) $limit;
    if ($bytes < 0)
    {
        // Unlimited: nothing to raise.
        return;
    }
    switch (strtoupper(substr($limit, -1)))
    {
        case 'G':
            $bytes *= 1024;
            // no break: G -> M -> K -> bytes
        case 'M':
            $bytes *= 1024;
            // no break
        case 'K':
            $bytes *= 1024;
    }
    if ($bytes < 1024 * 1024 * 1024)
    {
        ini_set('memory_limit', '1024M');
    }
});
if( PHP_SAPI != 'cli' )
{
    exit("You must run the CLI environment\n");
}
// Date.timezone: fall back to Asia/Shanghai when php.ini does not set one.
if (!ini_get('date.timezone'))
{
    date_default_timezone_set('Asia/Shanghai');
}
// Core library directory and the paths derived from it.
define('CORE', dirname(__FILE__));
define('PATH_ROOT', CORE."/../");
define('PATH_DATA', CORE."/../data");
define('PATH_LIBRARY', CORE."/../library");
//系统配置
//if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
//{
//require PATH_ROOT."/config/inc_config.php";
//}

579
vendor/owner888/phpspider/core/db.php vendored Normal file
View File

@ -0,0 +1,579 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider数据库类文件
//----------------------------------
namespace phpspider\core;
/**
 * Static MySQL helper built on mysqli.
 *
 * Keeps one connection per named link, reconnects when the server has gone
 * away or when the process id changes, and offers small helpers that build
 * INSERT/UPDATE/DELETE statements from arrays.
 *
 * NOTE(review): values are escaped with stripslashes()+addslashes(), not with
 * mysqli_real_escape_string() or prepared statements; a value containing a
 * literal backslash is altered by stripslashes(). Left as-is to keep stored
 * data compatible - confirm before feeding untrusted input.
 */
class db
{
    /** @var array Connection settings keyed by link name. */
    private static $configs = array();
    /** @var mixed Return value of the most recent mysqli_query() call. */
    private static $rsid;
    /** @var array Per-link state with the keys 'conn', 'fail' and 'pid'. */
    private static $links = array();
    /** @var string Link used by all subsequent calls. */
    private static $link_name = 'default';
    /** @var bool True while autocommit has been switched off through autocommit(false). */
    private static $autocommiting = false;

    /**
     * Ensure the active link has a connection.
     *
     * Connects when no connection exists yet; after 5 failed attempts the
     * process exits with status 250. When a connection exists but was created
     * by a different process id, all links are dropped and re-created.
     *
     * @return void
     */
    public static function _init()
    {
        // Resolve the configuration of the active link.
        $config = self::$link_name == 'default' ? self::_get_default_config() : self::$configs[self::$link_name];

        // Create the connection.
        if (empty(self::$links[self::$link_name]) || empty(self::$links[self::$link_name]['conn']))
        {
            // First connection of this link: initialise the failure counter and owner pid.
            if (empty(self::$links[self::$link_name]))
            {
                self::$links[self::$link_name]['fail'] = 0;
                self::$links[self::$link_name]['pid'] = function_exists('posix_getpid') ? posix_getpid() : 0;
                //echo "progress[".self::$links[self::$link_name]['pid']."] create db connect[".self::$link_name."]\n";
            }
            self::$links[self::$link_name]['conn'] = mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);
            if(mysqli_connect_errno())
            {
                self::$links[self::$link_name]['fail']++;
                $errmsg = 'Mysql Connect failed['.self::$links[self::$link_name]['fail'].']: ' . mysqli_connect_error();
                echo util::colorize(date("H:i:s") . " {$errmsg}\n\n", 'fail');
                log::add($errmsg, "Error");
                // Abort the process after 5 failed connection attempts.
                if (self::$links[self::$link_name]['fail'] >= 5)
                {
                    exit(250);
                }
                // Retry; 'conn' is still empty so the connect branch runs again.
                self::_init();
            }
            else
            {
                mysqli_query(self::$links[self::$link_name]['conn'], " SET character_set_connection=utf8, character_set_results=utf8, character_set_client=binary, sql_mode='' ");
            }
        }
        else
        {
            $curr_pid = function_exists('posix_getpid') ? posix_getpid() : 0;
            // A connection created by another (parent) process must not be shared:
            // drop it and connect again from this process.
            if (self::$links[self::$link_name]['pid'] != $curr_pid)
            {
                self::clear_link();
            }
        }
    }

    /**
     * Close every open link and reconnect the active one.
     *
     * Original author's note: in a multi-process setup a child must call this
     * once if the parent already connected, otherwise both processes fight over
     * one handle ("Error while reading greeting packet").
     *
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-03-29 00:51
     */
    public static function clear_link()
    {
        foreach (self::$links as $k => $v)
        {
            // Only close real handles; a failed connect leaves false/null here.
            if (!empty($v['conn']))
            {
                @mysqli_close($v['conn']);
            }
            unset(self::$links[$k]);
        }
        // A fresh connection starts with autocommit enabled.
        self::$autocommiting = false;
        // Note: only the currently selected link is reconnected.
        self::_init();
    }

    /**
     * Switch to the named link (only needed when several databases are used).
     *
     * @param string $link_name Link identifier.
     * @param array  $config    Connection settings in the same format as
     *                          $GLOBALS['config']['db']; only needs to be passed
     *                          the first time a link name is used.
     * @return void
     * @throws \Exception When no config was given and none is stored for the link.
     */
    public static function set_connect($link_name, $config = array())
    {
        if (empty($config) && empty(self::$configs[$link_name]))
        {
            // Fully qualified: this file lives in a namespace, a bare
            // "Exception" would resolve to a class that does not exist.
            throw new \Exception("You not set a config array for connect!");
        }
        self::$link_name = $link_name;
        if (!empty($config))
        {
            self::$configs[self::$link_name] = $config;
        }
    }

    /**
     * Switch back to the default link (only needed when several databases are used).
     *
     * @return void
     */
    public static function set_connect_default()
    {
        $config = self::_get_default_config();
        self::set_connect('default', $config);
    }

    /**
     * Return the default connection settings, loading them from
     * $GLOBALS['config']['db'] on first use. Terminates the script when that
     * entry is not an array.
     *
     * @return array
     */
    protected static function _get_default_config()
    {
        if (empty(self::$configs['default']))
        {
            if (!isset($GLOBALS['config']['db']) || !is_array($GLOBALS['config']['db']))
            {
                exit('db.php _get_default_config()' . '没有mysql配置');
            }
            self::$configs['default'] = $GLOBALS['config']['db'];
        }
        return self::$configs['default'];
    }

    /**
     * Return the given result, or the result of the last query() when none is given.
     *
     * @param mixed $rsid
     * @return mixed
     */
    protected static function _get_rsid($rsid = '')
    {
        // Result objects are returned directly so they are never compared to a string.
        if (is_object($rsid))
        {
            return $rsid;
        }
        return $rsid == '' ? self::$rsid : $rsid;
    }

    /**
     * Turn autocommit on or off for the active connection.
     *
     * Calling with false while autocommit is already off is a no-op, so nested
     * begin_tran() calls are harmless. Calling with true always reaches the
     * server, which is what lets commit()/rollback() leave transaction mode.
     *
     * @param bool $mode True to enable autocommit, false to disable it.
     * @return bool Result of mysqli_autocommit(), or true for the no-op case.
     */
    public static function autocommit($mode = false)
    {
        if (!$mode && self::$autocommiting)
        {
            return true;
        }
        self::_init();
        $ok = mysqli_autocommit(self::$links[self::$link_name]['conn'], $mode);
        if ($ok)
        {
            self::$autocommiting = !$mode;
        }
        return $ok;
    }

    /**
     * Start a transaction by disabling autocommit.
     *
     * @return bool
     */
    public static function begin_tran()
    {
        return self::autocommit(false);
    }

    /**
     * Commit the current transaction and re-enable autocommit.
     *
     * @return bool Always true.
     */
    public static function commit()
    {
        mysqli_commit(self::$links[self::$link_name]['conn']);
        self::autocommit(true);
        return true;
    }

    /**
     * Roll back the current transaction and re-enable autocommit.
     *
     * @return bool Always true.
     */
    public static function rollback()
    {
        mysqli_rollback(self::$links[self::$link_name]['conn']);
        self::autocommit(true);
        return true;
    }

    /**
     * Run a SQL statement on the active link.
     *
     * On error 2006/2013 the connection is dropped and the statement is retried
     * on a new connection. Any other error is logged together with a backtrace.
     *
     * @param string $sql
     * @return mixed Return value of mysqli_query(), or false on failure.
     */
    public static function query($sql)
    {
        $sql = trim($sql);
        // Make sure a connection exists.
        self::_init();
        self::$rsid = @mysqli_query(self::$links[self::$link_name]['conn'], $sql);
        if (self::$rsid === false)
        {
            // No ping before every query; reconnect only after a failure.
            $errno = mysqli_errno(self::$links[self::$link_name]['conn']);
            if ($errno == 2013 || $errno == 2006)
            {
                $errmsg = mysqli_error(self::$links[self::$link_name]['conn']);
                log::add($errmsg, "Error");
                @mysqli_close(self::$links[self::$link_name]['conn']);
                self::$links[self::$link_name]['conn'] = null;
                return self::query($sql);
            }
            $errmsg = "Query SQL: ".$sql;
            log::add($errmsg, "Warning");
            $errmsg = "Error SQL: ".mysqli_error(self::$links[self::$link_name]['conn']);
            log::add($errmsg, "Warning");
            $backtrace = debug_backtrace();
            array_shift($backtrace);
            $narr = array('class', 'type', 'function', 'file', 'line');
            $err = "debug_backtrace\n";
            foreach($backtrace as $i => $l)
            {
                // Frames may lack some keys; default them to ''.
                foreach($narr as $k)
                {
                    if( !isset($l[$k]) )
                    {
                        $l[$k] = '';
                    }
                }
                $err .= "[$i] in function {$l['class']}{$l['type']}{$l['function']} ";
                if($l['file']) $err .= " in {$l['file']} ";
                if($l['line']) $err .= " on line {$l['line']} ";
                $err .= "\n";
            }
            log::add($err);
            return false;
        }
        else
        {
            return self::$rsid;
        }
    }

    /**
     * Fetch the next row of a result as an associative array.
     *
     * @param mixed $rsid Result to read; defaults to the last query() result.
     * @return array|null|false Next row; null when there are no more rows or
     *                          when there is no result set to read from.
     */
    public static function fetch($rsid = '')
    {
        $rsid = self::_get_rsid($rsid);
        // Non-SELECT statements yield true, failed ones false: nothing to fetch.
        if (!($rsid instanceof \mysqli_result))
        {
            return null;
        }
        return mysqli_fetch_array($rsid, MYSQLI_ASSOC);
    }

    /**
     * Run a query and return its first row. " limit 1 " is appended when the
     * statement does not already contain the word "limit".
     *
     * @param string $sql
     * @return array|null Row; empty array when the query failed; null when no row matched.
     */
    public static function get_one($sql)
    {
        if (!preg_match("/limit/i", $sql))
        {
            $sql = preg_replace("/[,;]$/i", '', trim($sql)) . " limit 1 ";
        }
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return array();
        }
        $row = self::fetch($rsid);
        self::free($rsid);
        return $row;
    }

    /**
     * Run a query and return all rows.
     *
     * @param string $sql
     * @return array|false Rows; empty array when the query failed; false when
     *                     the query succeeded but returned no rows.
     */
    public static function get_all($sql)
    {
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return array();
        }
        $rows = array();
        while ( $row = self::fetch($rsid) )
        {
            $rows[] = $row;
        }
        self::free($rsid);
        return empty($rows) ? false : $rows;
    }

    /**
     * Free a result set. Values that are not result sets are ignored.
     *
     * @param mixed $rsid
     * @return void
     */
    public static function free($rsid)
    {
        if ($rsid instanceof \mysqli_result)
        {
            mysqli_free_result($rsid);
        }
    }

    /**
     * @return int|string Auto-increment id generated by the last statement on the active link.
     */
    public static function insert_id()
    {
        return mysqli_insert_id(self::$links[self::$link_name]['conn']);
    }

    /**
     * @return int|string Rows affected by the last statement on the active link.
     */
    public static function affected_rows()
    {
        return mysqli_affected_rows(self::$links[self::$link_name]['conn']);
    }

    /**
     * Build (and optionally run) an "Insert Ignore" statement for one row.
     *
     * @param string     $table
     * @param array|null $data       Column => value map.
     * @param bool       $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, insert id on success, or false on failure.
     */
    public static function insert($table = '', $data = null, $return_sql = false)
    {
        if ($data === null)
        {
            $data = array();
        }
        $items_sql = $values_sql = "";
        foreach ($data as $k => $v)
        {
            // Undo any existing escaping first so values are not escaped twice.
            $v = stripslashes($v);
            $v = addslashes($v);
            $items_sql .= "`$k`,";
            $values_sql .= "\"$v\",";
        }
        $sql = "Insert Ignore Into `{$table}` (" . substr($items_sql, 0, -1) . ") Values (" . substr($values_sql, 0, -1) . ")";
        if ($return_sql)
        {
            return $sql;
        }
        if (self::query($sql))
        {
            return mysqli_insert_id(self::$links[self::$link_name]['conn']);
        }
        return false;
    }

    /**
     * Build (and optionally run) one "Insert Ignore" statement for many rows.
     *
     * The column list is taken from the first row; keys that are not columns of
     * the table (see get_fields()) are dropped.
     *
     * @param string $table
     * @param array  $set        List of column => value maps.
     * @param bool   $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, insert id (or the query result when the id is
     *               empty), or false on invalid input.
     */
    public static function insert_batch($table = '', $set = NULL, $return_sql = FALSE)
    {
        if (empty($table) || empty($set) || !is_array($set))
        {
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);
        $keys_sql = $vals_sql = array();
        // Explicit flag instead of comparing keys with 0: "$k == 0" is false for
        // column names on PHP 8, which left the column list empty.
        $is_first_row = true;
        foreach ($set as $val)
        {
            ksort($val);
            $vals = array();
            foreach ($val as $k => $v)
            {
                // Skip keys that are not columns of the table.
                if (!in_array((string) $k, $fields, true))
                {
                    continue;
                }
                // The first row supplies the column list.
                if ($is_first_row)
                {
                    $keys_sql[] = "`$k`";
                }
                $vals[] = "\"$v\"";
            }
            $is_first_row = false;
            $vals_sql[] = implode(",", $vals);
        }
        if (empty($keys_sql))
        {
            // No usable column: the statement would be invalid.
            return false;
        }
        $sql = "Insert Ignore Into `{$table}`(".implode(", ", $keys_sql).") Values (".implode("), (", $vals_sql).")";
        if ($return_sql) return $sql;
        $rt = self::query($sql);
        $insert_id = self::insert_id();
        return empty($insert_id) ? $rt : $insert_id;
    }

    /**
     * Build (and optionally run) one UPDATE ... CASE statement that updates
     * many rows, matching rows on the $index column.
     *
     * @param string            $table
     * @param array             $set        List of column => value maps; each must contain $index.
     * @param string            $index      Column used to match rows.
     * @param string|array|null $where      Extra condition(s), joined with "And".
     * @param bool              $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count (or the query result when
     *               that count is empty), or false on invalid input / failure.
     */
    public static function update_batch($table = '', $set = NULL, $index = NULL, $where = NULL, $return_sql = FALSE)
    {
        if (empty($table) || is_null($set) || is_null($index))
        {
            // Return instead of exit so the caller keeps running.
            return false;
        }
        $set = self::strsafe($set);
        $fields = self::get_fields($table);
        $ids = array();
        $final = array();
        foreach ($set as $val)
        {
            ksort($val);
            // Keyed by hash so repeated index values collapse into one entry.
            $key = md5($val[$index]);
            $ids[$key] = $val[$index];
            foreach (array_keys($val) as $field)
            {
                if ($field != $index)
                {
                    $final[$field][$key] = 'When `'.$index.'` = "'.$val[$index].'" Then "'.$val[$field].'"';
                }
            }
        }
        // Normalise $where to an array before appending ("[]" on a string is an error).
        if (empty($where))
        {
            $where = array();
        }
        elseif (!is_array($where))
        {
            $where = array($where);
        }
        $where[] = $index.' In ("'.implode('","', $ids).'")';
        $where = " Where ".implode(" And ", $where);
        $sql = "Update `".$table."` Set ";
        $cases = '';
        foreach ($final as $k => $v)
        {
            // Skip keys that are not columns of the table.
            if (!in_array((string) $k, $fields, true))
            {
                continue;
            }
            $cases .= '`'.$k.'` = Case '."\n";
            foreach ($v as $row)
            {
                $cases .= $row."\n";
            }
            $cases .= 'Else `'.$k.'` End, ';
        }
        if ($cases === '')
        {
            // Nothing to set: the statement would be invalid.
            return false;
        }
        // Drop the trailing ", ".
        $sql .= substr($cases, 0, -2);
        $sql .= $where;
        if ($return_sql) return $sql;
        $rt = self::query($sql);
        if ($rt === false)
        {
            return false;
        }
        $affected_rows = self::affected_rows();
        return empty($affected_rows) ? $rt : $affected_rows;
    }

    /**
     * Build (and optionally run) an UPDATE statement.
     *
     * @param string            $table
     * @param array             $data       Column => value map.
     * @param string|array|null $where      Condition(s), joined with "And"; empty entries are dropped.
     * @param bool              $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count, or false on empty data / failure.
     */
    public static function update($table = '', $data = array(), $where = null, $return_sql = false)
    {
        if (empty($data))
        {
            // Without assignments the trailing-comma trim below would damage the SQL.
            return false;
        }
        $sql = "UPDATE `{$table}` SET ";
        foreach ($data as $k => $v)
        {
            $v = stripslashes($v);
            $v = addslashes($v);
            $sql .= "`{$k}` = \"{$v}\",";
        }
        if (!is_array($where))
        {
            $where = array($where);
        }
        // Drop empty conditions, otherwise array("") would produce a bare "Where".
        foreach ($where as $k => $v)
        {
            if (empty($v))
            {
                unset($where[$k]);
            }
        }
        $where = empty($where) ? "" : " Where " . implode(" And ", $where);
        // Remove the trailing comma of the assignment list.
        $sql = substr($sql, 0, -1) . $where;
        if ($return_sql)
        {
            return $sql;
        }
        if (self::query($sql))
        {
            return mysqli_affected_rows(self::$links[self::$link_name]['conn']);
        }
        return false;
    }

    /**
     * Build (and optionally run) a DELETE statement. Refuses to run without a
     * condition so a table cannot be emptied by accident.
     *
     * @param string            $table
     * @param string|array|null $where      Condition(s), joined with "And".
     * @param bool              $return_sql Return the SQL instead of executing it.
     * @return mixed SQL string, affected row count, or false.
     */
    public static function delete($table = '', $where = null, $return_sql = false)
    {
        if (empty($where))
        {
            return false;
        }
        $where = 'Where ' . (!is_array($where) ? $where : implode(' And ', $where));
        $sql = "Delete From `{$table}` {$where}";
        if ($return_sql)
        {
            return $sql;
        }
        if (self::query($sql))
        {
            return mysqli_affected_rows(self::$links[self::$link_name]['conn']);
        }
        return false;
    }

    /**
     * Check the active connection and reconnect when it is missing or dead.
     *
     * @return void
     */
    public static function ping()
    {
        if (empty(self::$links[self::$link_name]['conn']))
        {
            // Nothing to ping yet; just connect.
            self::_init();
            return;
        }
        if (!mysqli_ping(self::$links[self::$link_name]['conn']))
        {
            @mysqli_close(self::$links[self::$link_name]['conn']);
            self::$links[self::$link_name]['conn'] = null;
            self::_init();
        }
    }

    /**
     * Escape a value, or every leaf of a nested array, with
     * stripslashes() followed by addslashes().
     *
     * @param mixed $array Scalar or (nested) array.
     * @return mixed Same shape as the input, escaped.
     */
    public static function strsafe($array)
    {
        if (is_array($array) !== true)
        {
            // Strip existing escaping first so the value is not escaped twice.
            $array = stripslashes($array);
            $array = addslashes($array);
            //$array = strtr($array, array('_'=>'\_', '%'=>'\%'));
            return $array;
        }

        $arrays = array();
        foreach ($array as $key => $val)
        {
            if (is_array($val) === true)
            {
                $arrays[$key] = self::strsafe($val);
            }
            else
            {
                $val = stripslashes($val);
                $val = addslashes($val);
                // "_" and "%" are not handled by addslashes (left disabled).
                //$val = strtr($val, array('_'=>'\_', '%'=>'\%'));
                $arrays[$key] = $val;
            }
        }
        return $arrays;
    }

    /**
     * List the column names of a table, excluding auto-increment columns.
     * Used by insert_batch() and update_batch() to filter input keys.
     *
     * @param string $table
     * @return array Column names; empty when the table could not be described.
     */
    public static function get_fields($table)
    {
        // "SHOW COLUMNS FROM $table" would give the same result.
        $rows = self::get_all("Desc `{$table}`");
        $fields = array();
        if (empty($rows))
        {
            // get_all() returns false/array() when there is nothing to iterate.
            return $fields;
        }
        foreach ($rows as $v)
        {
            // Skip the auto-increment key.
            // if ($v['Key'] != 'PRI')
            if ($v['Extra'] != 'auto_increment')
            {
                $fields[] = $v['Field'];
            }
        }
        return $fields;
    }

    /**
     * Tell whether "SHOW TABLES LIKE '<name>'" returns a row.
     *
     * @param string $table_name
     * @return bool
     */
    public static function table_exists($table_name)
    {
        $sql = "SHOW TABLES LIKE '" . $table_name . "'";
        $rsid = self::query($sql);
        if ($rsid === false)
        {
            return false;
        }
        $table = self::fetch($rsid);
        self::free($rsid);
        return !empty($table);
    }
}

101
vendor/owner888/phpspider/core/init.php vendored Normal file
View File

@ -0,0 +1,101 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider公共入口文件
//----------------------------------
// Strict development mode: report every error level.
error_reporting( E_ALL );
//ini_set('display_errors', 1);
// Never time out.
ini_set('max_execution_time', 0);
set_time_limit(0);
// Memory limit: raise it to 1024M only when the current limit is lower.
// The ini value may be "-1" (unlimited) or use K/M/G shorthand, so it has to be
// converted to bytes first; intval() alone would read "2G" as 2 and "-1" as
// "too small" and would LOWER the limit.
call_user_func(function () {
    $limit = trim((string) ini_get('memory_limit'));
    $bytes = (int) $limit;
    if ($bytes < 0)
    {
        // Unlimited: nothing to raise.
        return;
    }
    switch (strtoupper(substr($limit, -1)))
    {
        case 'G':
            $bytes *= 1024;
            // no break: G -> M -> K -> bytes
        case 'M':
            $bytes *= 1024;
            // no break
        case 'K':
            $bytes *= 1024;
    }
    if ($bytes < 1024 * 1024 * 1024)
    {
        ini_set('memory_limit', '1024M');
    }
});
if( PHP_SAPI != 'cli' )
{
    exit("You must run the CLI environment\n");
}
// Set the timezone.
date_default_timezone_set('Asia/Shanghai');
// Pull in PATH_DATA and the other path constants.
require_once __DIR__ . '/constants.php';
// Core library directory. constants.php normally defines these already, so only
// define what is still missing to avoid "constant already defined" warnings.
defined('CORE') || define('CORE', dirname(__FILE__));
defined('PATH_ROOT') || define('PATH_ROOT', CORE."/../");
defined('PATH_DATA') || define('PATH_DATA', CORE."/../data");
defined('PATH_LIBRARY') || define('PATH_LIBRARY', CORE."/../library");
// System configuration (optional file).
if( file_exists( PATH_ROOT."/config/inc_config.php" ) )
{
    require PATH_ROOT."/config/inc_config.php";
}
require CORE.'/log.php';
require CORE.'/requests.php';
require CORE.'/selector.php';
require CORE.'/util.php';
require CORE.'/db.php';
require CORE.'/cache.php';
require CORE."/worker.php";
require CORE."/phpspider.php";
// Create the data directories on startup.
util::path_exists(PATH_DATA);
util::path_exists(PATH_DATA."/lock");
util::path_exists(PATH_DATA."/log");
util::path_exists(PATH_DATA."/cache");
util::path_exists(PATH_DATA."/status");
/**
 * Autoload callback: points the include path at PATH_ROOT/library/ and lets
 * spl_autoload() look the class up there.
 *
 * @param string $classname
 * @return void
 */
function autoload($classname) {
    set_include_path(PATH_ROOT.'/library/');
    spl_autoload($classname); //replaces include/require
}
spl_autoload_extensions('.php');
spl_autoload_register('autoload');
/**
* 自动加载类库处理
* @return void
*/
//function __autoload( $classname )
//{
//$classname = preg_replace("/[^0-9a-z_]/i", '', $classname);
//if( class_exists ( $classname ) ) {
//return true;
//}
//$classfile = $classname.'.php';
//try
//{
//if ( file_exists ( PATH_LIBRARY.'/'.$classfile ) )
//{
//require PATH_LIBRARY.'/'.$classfile;
//}
//else
//{
//throw new Exception ( 'Error: Cannot find the '.$classname );
//}
//}
//catch ( Exception $e )
//{
//log::error($e->getMessage().'|'.$classname);
//exit();
//}
//}

119
vendor/owner888/phpspider/core/log.php vendored Normal file
View File

@ -0,0 +1,119 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider日志类文件
//----------------------------------
namespace phpspider\core;
// 引入PATH_DATA
require_once __DIR__ . '/constants.php';
/**
 * Minimal logger: formats a line, optionally echoes it and appends it to a file.
 */
class log
{
    /** @var bool Echo every line in addition to writing it. */
    public static $log_show = false;
    /** @var string|false When set, only types contained in this string are logged by msg(). */
    public static $log_type = false;
    /** @var string File that msg() appends to. */
    public static $log_file = "data/phpspider.log";
    /** @var string Text emitted before the message (ANSI colour start). */
    public static $out_sta = "";
    /** @var string Text emitted after the message (ANSI colour reset). */
    public static $out_end = "";

    /**
     * Write the message as-is (no timestamp, no colour, never filtered).
     */
    public static function note($msg)
    {
        self::apply_color('');
        self::msg($msg, 'note');
    }

    /**
     * Write an uncoloured [info] line.
     */
    public static function info($msg)
    {
        self::apply_color('');
        self::msg($msg, 'info');
    }

    /**
     * Write a [warn] line, yellow unless util::is_win() is true.
     */
    public static function warn($msg)
    {
        self::apply_color("\033[33m");
        self::msg($msg, 'warn');
    }

    /**
     * Write a [debug] line, cyan unless util::is_win() is true.
     */
    public static function debug($msg)
    {
        self::apply_color("\033[36m");
        self::msg($msg, 'debug');
    }

    /**
     * Write an [error] line, red unless util::is_win() is true.
     */
    public static function error($msg)
    {
        self::apply_color("\033[31m");
        self::msg($msg, 'error');
    }

    /**
     * Reset the colour wrappers, then enable the given ANSI colour when one is
     * requested and util::is_win() is false.
     *
     * @param string $ansi_start ANSI start sequence, or '' for no colour.
     * @return void
     */
    private static function apply_color($ansi_start)
    {
        self::$out_sta = self::$out_end = "";
        if ($ansi_start !== '' && !util::is_win())
        {
            self::$out_sta = $ansi_start;
            self::$out_end = "\033[0m";
        }
    }

    /**
     * Format and emit one log line.
     *
     * @param string $msg
     * @param string $log_type 'note' bypasses both the type filter and the timestamp.
     * @return false|void False when the type is filtered out by self::$log_type.
     */
    public static function msg($msg, $log_type)
    {
        $is_note = ($log_type == 'note');

        if (!$is_note && self::$log_type && strpos(self::$log_type, $log_type) === false)
        {
            return false;
        }

        if ($is_note)
        {
            $line = self::$out_sta. $msg . "\n".self::$out_end;
        }
        else
        {
            $line = self::$out_sta.date("Y-m-d H:i:s")." [{$log_type}] " . $msg .self::$out_end. "\n";
        }

        if (self::$log_show)
        {
            echo $line;
        }
        file_put_contents(self::$log_file, $line, FILE_APPEND | LOCK_EX);
    }

    /**
     * Append a message to PATH_DATA/log/error.log.
     *
     * @param string $msg
     * @param string $log_type Note|Warning|Error; when non-empty the message is
     *                         prefixed with a timestamp and the type, and a
     *                         newline is appended.
     * @return void
     */
    public static function add($msg, $log_type = '')
    {
        $line = $msg;
        if ($log_type != '')
        {
            $line = date("Y-m-d H:i:s")." [{$log_type}] " . $msg . "\n";
        }
        if (self::$log_show)
        {
            echo $line;
        }
        //file_put_contents(PATH_DATA."/log/".strtolower($log_type).".log", $msg, FILE_APPEND | LOCK_EX);
        file_put_contents(PATH_DATA."/log/error.log", $line, FILE_APPEND | LOCK_EX);
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1388
vendor/owner888/phpspider/core/queue.php vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,998 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
// +----------------------------------------------------------------------
// | GET请求
// | requests::get('http://www.test.com');
// | SERVER
// | $_GET
// +----------------------------------------------------------------------
// | POST请求
// | $data = array('name'=>'request');
// | requests::post('http://www.test.com', $data);
// | SERVER
// | $_POST
// +----------------------------------------------------------------------
// | POST RESTful请求
// | $data = array('name'=>'request');
// | $data_string = json_encode($data);
// | requests::set_header("Content-Type", "application/json");
// | requests::post('http://www.test.com', $data_string);
// | SERVER
// | file_get_contents('php://input')
// +----------------------------------------------------------------------
// | POST 文件上传
// | $data = array('file1'=>''./data/phpspider.log'');
// | requests::post('http://www.test.com', null, $data);
// | SERVER
// | $_FILES
// +----------------------------------------------------------------------
// | 代理
// | requests::set_proxy(array('223.153.69.150:42354'));
// | $html = requests::get('https://www.test.com');
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider请求类文件
//----------------------------------
namespace phpspider\core;
// Fallback for PHP builds without curl_file_create(): returns the legacy
// "@path;filename=...;type=..." upload string instead of a CURLFile object.
if (!function_exists('curl_file_create'))
{
    function curl_file_create($filename, $mimetype = '', $postname = '')
    {
        // Name sent to the server: explicit post name, else the file's base name.
        $sent_name = $postname ?: basename($filename);
        $upload = "@$filename;filename=" . $sent_name;
        if ($mimetype)
        {
            $upload .= ";type=$mimetype";
        }
        return $upload;
    }
}
class requests
{
const VERSION = '2.0.1';
protected static $ch = null;
/**** Public variables ****/
/* user definable vars */
public static $timeout = 15;
public static $encoding = null;
public static $input_encoding = null;
public static $output_encoding = null;
public static $cookies = array(); // array of cookies to pass
// $cookies['username'] = "seatle";
public static $rawheaders = array(); // array of raw headers to send
public static $domain_cookies = array(); // array of cookies for domain to pass
public static $hosts = array(); // random host binding for make request faster
public static $headers = array(); // headers returned from server sent here
public static $useragents = array("requests/2.0.0"); // random agent we masquerade as
public static $client_ips = array(); // random ip we masquerade as
public static $proxies = array(); // random proxy ip
public static $raw = ""; // head + body content returned from server sent here
public static $head = ""; // head content
public static $content = ""; // The body before encoding
public static $text = ""; // The body after encoding
public static $info = array(); // curl info
public static $history = 302; // http request status before redirect. ex:30x
public static $status_code = 0; // http request status
public static $error = ""; // error messages sent here
/**
 * Store the request timeout in self::$timeout.
 *
 * Per the original note, an array value makes the connect and read timeouts
 * be set separately; the code that consumes the value is not part of this
 * method.
 *
 * @param int|array $timeout
 * @return void
 */
public static function set_timeout($timeout)
{
self::$timeout = $timeout;
}
/**
 * Store the proxy list in self::$proxies.
 * A single proxy is wrapped into a one-element array. Per the original note,
 * one entry is picked at random per request when several are given.
 *
 * @param mixed $proxy One proxy or a list of proxies, e.g.
 * array (
 *    'socks5://user1:pass2@host:port',
 *    'socks5://user2:pass2@host:port'
 * )
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function set_proxy($proxy)
{
    if (!is_array($proxy))
    {
        $proxy = array($proxy);
    }
    self::$proxies = $proxy;
}
/**
 * Remove all configured proxies by resetting self::$proxies to an empty array.
 * Original author's note: every link carries its own proxy setting, and some
 * links need a proxy while others do not, hence this reset.
 *
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2018-07-16 17:59
 */
public static function del_proxy()
{
self::$proxies = array();
}
/**
 * Set one custom request header.
 * The value is stored in requests::$rawheaders under the header name, so it
 * can be read back as e.g. requests::$rawheaders['Content-Type'].
 *
 * @param string $key   Header name.
 * @param string $value Header value.
 * @return void
 */
public static function set_header($key, $value)
{
self::$rawheaders[$key] = $value;
}
/**
 * Set a single cookie, either globally or for one domain.
 *
 * @param string $key    Cookie name; an empty name is rejected.
 * @param string $value  Cookie value.
 * @param string $domain Target domain; when empty the global cookie jar is used.
 * @return bool False when $key is empty, true otherwise.
 */
public static function set_cookie($key, $value, $domain = '')
{
    if (empty($key))
    {
        return false;
    }

    if (empty($domain))
    {
        self::$cookies[$key] = $value;
        return true;
    }

    self::$domain_cookies[$domain][$key] = $value;
    return true;
}
/**
 * Set several cookies at once from a "name=value; name2=value2" string.
 *
 * Names and values are trimmed, so the usual "; " separator does not leak a
 * leading space into the cookie name. Segments without a name are skipped.
 * A value of "0" is preserved.
 *
 * @param string $cookies Cookie string, pairs separated by ";".
 * @param string $domain  Target domain; when empty the global cookie jar is used.
 * @return bool False when $cookies is blank, true otherwise.
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function set_cookies($cookies, $domain = '')
{
    $cookies = trim((string) $cookies);
    if ($cookies === '')
    {
        // explode() never returns an empty array, so test the input itself.
        return false;
    }

    foreach (explode(';', $cookies) as $cookie)
    {
        $cookie_arr = explode('=', $cookie, 2);
        $key = trim($cookie_arr[0]);
        if ($key === '')
        {
            // Empty segment, e.g. produced by a trailing ";".
            continue;
        }
        // isset() rather than empty(): "0" is a legitimate cookie value.
        $value = isset($cookie_arr[1]) ? trim($cookie_arr[1]) : '';

        if (!empty($domain))
        {
            self::$domain_cookies[$domain][$key] = $value;
        }
        else
        {
            self::$cookies[$key] = $value;
        }
    }
    return true;
}
/**
 * Read a single cookie value.
 *
 * @param mixed  $name   Cookie name.
 * @param string $domain Domain to read from; when empty the global cookie jar
 *                       (cookies set without a domain) is used.
 * @return mixed The cookie value, or '' when the domain or the cookie is unknown.
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function get_cookie($name, $domain = '')
{
    if (empty($domain))
    {
        $jar = self::$cookies;
    }
    elseif (isset(self::$domain_cookies[$domain]))
    {
        $jar = self::$domain_cookies[$domain];
    }
    else
    {
        return '';
    }

    if (isset($jar[$name]))
    {
        return $jar[$name];
    }
    return '';
}
/**
 * Return all cookies of one jar.
 *
 * @param string $domain Domain to read from; when empty the global cookie jar
 *                       (cookies set without a domain) is returned.
 * @return array Name => value map; empty array when the domain is unknown.
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function get_cookies($domain = '')
{
    if (empty($domain))
    {
        return self::$cookies;
    }
    if (!isset(self::$domain_cookies[$domain]))
    {
        return array();
    }
    return self::$domain_cookies[$domain];
}
/**
 * Delete a single cookie.
 *
 * @param string $key    Cookie name.
 * @param string $domain Domain to delete from; when empty the global cookie jar is used.
 * @return bool False when $key is empty or the domain is unknown, true otherwise
 *              (also when the cookie did not exist).
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function del_cookie($key, $domain = '')
{
    if (empty($key))
    {
        return false;
    }

    if (empty($domain))
    {
        // unset() on a missing key is a silent no-op.
        unset(self::$cookies[$key]);
        return true;
    }

    if (!isset(self::$domain_cookies[$domain]))
    {
        return false;
    }

    unset(self::$domain_cookies[$domain][$key]);
    return true;
}
/**
 * Delete every cookie of one jar.
 *
 * @param string $domain Domain whose cookies are removed; when empty the
 *                       global cookie jar is emptied.
 * @return bool False when the domain is unknown, true otherwise.
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function del_cookies($domain = '')
{
    if (empty($domain))
    {
        self::$cookies = array();
        return true;
    }

    if (!isset(self::$domain_cookies[$domain]))
    {
        return false;
    }

    unset(self::$domain_cookies[$domain]);
    return true;
}
/**
 * Store the user-agent list in self::$useragents.
 * A single string is wrapped into a one-element array.
 *
 * @param string|array $useragent One user agent or a list of them.
 * @return void
 */
public static function set_useragent($useragent)
{
    if (!is_array($useragent))
    {
        $useragent = array($useragent);
    }
    self::$useragents = $useragent;
}
/**
 * Set the Referer request header (stored in self::$rawheaders['Referer']).
 *
 * @param string $referer
 * @return void
 */
public static function set_referer($referer)
{
self::$rawheaders['Referer'] = $referer;
}
/**
 * Store the spoofed client IP list in self::$client_ips.
 * A single IP is wrapped into a one-element array. Per the original note,
 * passing an array means a random IP from it is used.
 *
 * @param string|array $ip One IP or a list of IPs.
 * @return void
 */
public static function set_client_ip($ip)
{
    if (!is_array($ip))
    {
        $ip = array($ip);
    }
    self::$client_ips = $ip;
}
/**
 * Clear the list of spoofed client IPs.
 *
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2018-07-16 17:59
 */
public static function del_client_ip()
{
    $no_ips = array();
    self::$client_ips = $no_ips;
}
/**
 * Set the Accept-Language request header (Chinese by default).
 *
 * @param string $lang Language tag stored under the 'Accept-Language' key of self::$rawheaders
 * @return void
 */
public static function set_accept_language($lang = 'zh-CN')
{
    $header_name = 'Accept-Language';
    self::$rawheaders[$header_name] = $lang;
}
/**
 * Bind a host name to one or more IP addresses.
 *
 * @param string       $host Host name to bind
 * @param string|array $ips  A single IP, or an array of IPs
 * @return void
 */
public static function set_hosts($host, $ips = array())
{
    if (!is_array($ips))
    {
        $ips = array($ips);
    }
    self::$hosts[$host] = $ips;
}
/**
 * Split the raw response in self::$raw into header and body.
 *
 * The split point is self::$info['header_size']. The header is stored in
 * self::$head, the untouched body in self::$content and the charset-converted
 * body in self::$text.
 *
 * @return array array($head, $converted_body)
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function split_header_body()
{
    $header_size = self::$info['header_size'];
    // HTTP header part of the raw response.
    self::$head = substr(self::$raw, 0, $header_size);
    // Body as received, before any charset conversion.
    self::$content = substr(self::$raw, $header_size);
    // Convert the body to the configured output encoding.
    $converted = self::encoding(self::$content);
    self::$encoding = self::$output_encoding;
    // Body after conversion.
    self::$text = $converted;
    return array(self::$head, $converted);
}
/**
 * Parse the Set-Cookie lines of a response header and store the cookies
 * in self::$domain_cookies[$domain].
 *
 * Cookie attributes (Path, Domain, Expires, Max-Age, SameSite) and flag-only
 * parts such as HttpOnly or Secure are not stored as cookies.
 *
 * @param mixed $header Raw response header text
 * @param mixed $domain Domain the cookies are stored under
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function get_response_cookies($header, $domain)
{
    preg_match_all("/.*?Set\-Cookie: ([^\r\n]*)/i", $header, $matches);
    if (empty($matches[1]))
    {
        return;
    }
    // Names that describe a cookie rather than being a cookie themselves.
    // 'samesite' is included so that "SameSite=Lax" is not sent back as a cookie.
    $attribute_names = array('path', 'domain', 'expires', 'max-age', 'samesite');
    // Flatten every Set-Cookie line into one list of "name=value" parts.
    $parts = explode(';', implode(';', $matches[1]));
    foreach ($parts as $part)
    {
        $pair = explode('=', $part, 2);
        // Flags without a value (HttpOnly, Secure) are skipped.
        if (count($pair) < 2)
        {
            continue;
        }
        $cookie_name = trim($pair[0]);
        if ($cookie_name === '')
        {
            continue;
        }
        if (in_array(strtolower($cookie_name), $attribute_names, true))
        {
            continue;
        }
        self::$domain_cookies[$domain][$cookie_name] = trim($pair[1]);
    }
}
/**
 * Parse raw response header text into a name => value array.
 *
 * Lines without a colon (such as the status line) and lines with an empty
 * name or value are skipped. A value of "0" is kept. When a name occurs more
 * than once the last value wins. The result is also stored in self::$headers.
 *
 * @param mixed $header Raw response header text
 * @return array Parsed headers
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function get_response_headers($header)
{
    $headers = array();
    foreach (explode("\n", $header) as $line)
    {
        $pair = explode(':', $line, 2);
        if (count($pair) < 2)
        {
            continue;
        }
        $key = trim($pair[0]);
        $val = trim($pair[1]);
        // Compare against '' explicitly: empty() would also drop "0".
        if ($key === '' || $val === '')
        {
            continue;
        }
        $headers[$key] = $val;
    }
    self::$headers = $headers;
    return self::$headers;
}
/**
 * Detect the encoding of a string.
 *
 * @param string $string Text to inspect
 * @return string Lower-cased encoding name reported by mb_detect_encoding()
 */
public static function get_encoding($string)
{
    $candidates = array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1');
    $detected = mb_detect_encoding($string, $candidates);
    return strtolower($detected);
}
/**
 * Replace the <head> section of a page with an empty <head></head>.
 *
 * The match stops at the first closing head tag, so a later occurrence of
 * that tag (for example inside a script string) does not widen the removal.
 *
 * @param string $html Page source
 * @return mixed Result of preg_replace()
 */
private static function _remove_head($html)
{
    // \b keeps "<head" from matching "<header"; .*? stops at the first </head>.
    return preg_replace('/<head\b[^>]*>.*?<\/head>/is', '<head></head>', $html, 1);
}
/**
 * Loose check whether the argument looks like a URL.
 *
 * @param string $url Text to test
 * @return boolean true when the pattern matches somewhere in $url
 */
private static function _is_url($url)
{
    //$pattern = '/^http(s)?:\\/\\/.+/';
    $url_pattern = "/\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/)))/";
    return preg_match($url_pattern, $url) ? true : false;
}
/**
 * Create and configure the shared cURL handle if none is open.
 *
 * @return mixed The handle stored in self::$ch
 */
public static function init()
{
    // Reuse the handle while it is still an open resource.
    if (is_resource ( self::$ch ))
    {
        return self::$ch;
    }
    self::$ch = curl_init ();
    curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
    curl_setopt( self::$ch, CURLOPT_HEADER, false );
    curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION );
    // An array timeout means array(connect timeout, total timeout);
    // a scalar is the total timeout and half of it (rounded up) the connect timeout.
    if (is_array(self::$timeout))
    {
        $connect_timeout = self::$timeout[0];
        $total_timeout   = self::$timeout[1];
    }
    else
    {
        $connect_timeout = ceil(self::$timeout / 2);
        $total_timeout   = self::$timeout;
    }
    curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, $connect_timeout );
    curl_setopt( self::$ch, CURLOPT_TIMEOUT, $total_timeout );
    // Maximum number of redirects allowed.
    curl_setopt( self::$ch, CURLOPT_MAXREDIRS, 5 );
    // Keep libcurl from using signals for timeouts.
    curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true );
    return self::$ch;
}
/**
 * Send a GET request.
 *
 * @param mixed $url             Request URL
 * @param array $fields          Query parameters
 * @param bool  $allow_redirects Whether redirects are followed
 * @param mixed $cert            Passed through to request()
 * @return mixed Whatever request() returns
 */
public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    $response = self::request($url, 'get', $fields, NULL, $allow_redirects, $cert);
    return $response;
}
/**
 * Send a POST request.
 *
 * $fields may take three forms:
 *   1. an array:             array('name'=>'yangzetao')
 *   2. an http query string: http_build_query(array('name'=>'yangzetao'))
 *   3. a JSON string:        json_encode(array('name'=>'yangzetao'))
 * The first two are ordinary form posts. The third is a raw body post that
 * the receiving side has to read from the request stream.
 *
 * @param mixed $url             Request URL
 * @param mixed $fields          Post data, see above
 * @param array $files           Files to upload, as post name => file path
 * @param bool  $allow_redirects Whether redirects are followed
 * @param mixed $cert            Passed through to request()
 * @static
 * @access public
 * @return mixed Whatever request() returns
 */
public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    $response = self::request($url, 'POST', $fields, $files, $allow_redirects, $cert);
    return $response;
}
/**
 * Send a PUT request.
 *
 * An empty $files array is passed explicitly so that $allow_redirects and
 * $cert reach the matching parameters of request().
 *
 * @return mixed Whatever request() returns
 */
public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    return self::request($url, 'PUT', $fields, array(), $allow_redirects, $cert);
}
/**
 * Send a DELETE request.
 *
 * @return mixed Whatever request() returns
 */
public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    return self::request($url, 'DELETE', $fields, array(), $allow_redirects, $cert);
}
/**
 * Send a HEAD request.
 *
 * HEAD asks for the meta information of an entity without its body. It is
 * commonly used to test links for validity, accessibility and recent change.
 *
 * @return mixed Whatever request() returns
 */
public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    return self::request($url, 'HEAD', $fields, array(), $allow_redirects, $cert);
}
/**
 * Send an OPTIONS request.
 *
 * @return mixed Whatever request() returns
 */
public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    return self::request($url, 'OPTIONS', $fields, array(), $allow_redirects, $cert);
}
/**
 * Send a PATCH request.
 *
 * @return mixed Whatever request() returns
 */
public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL)
{
    self::init ();
    return self::request($url, 'PATCH', $fields, array(), $allow_redirects, $cert);
}
/**
 * Perform an HTTP request with the cURL handle in self::$ch.
 *
 * The handle is not created here; the get()/post() style wrappers call
 * init() first. The handle is closed at the end of every call.
 *
 * Side effects: fills self::$raw, self::$info, self::$status_code,
 * self::$history, self::$headers and the cookie jar of the request host,
 * and sets self::$error on failure. Entries written to self::$rawheaders
 * (Host, X-HTTP-Method-Override, User-Agent, CLIENT-IP, X-FORWARDED-FOR)
 * stay in that static array after the call.
 *
 * NOTE(review): $cert is accepted but never read in this body.
 * NOTE(review): CURLOPT_POSTFIELDS is only set for POST; $fields of other
 * non-GET methods are not sent.
 *
 * @param mixed  $url             Request URL
 * @param string $method          HTTP method (case-insensitive)
 * @param array  $fields          Query parameters for GET, form/body data for POST
 * @param array  $files           Files to upload (POST only), as post name => file path
 * @param bool   $allow_redirects Whether CURLOPT_FOLLOWLOCATION is enabled
 * @param mixed  $cert            CA certificate (unused here)
 * @return mixed false when the URL is rejected, otherwise the converted response body
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
{
$method = strtoupper($method);
if(!self::_is_url($url))
{
self::$error = "You have requested URL ({$url}) is not a valid HTTP address";
return false;
}
// For GET, append the fields to the URL as a query string
if ($method == 'GET' && !empty($fields))
{
$url = $url.(strpos($url, '?') === false ? '?' : '&').http_build_query($fields);
}
$parse_url = parse_url($url);
if (empty($parse_url) || empty($parse_url['host']) || !in_array($parse_url['scheme'], array('http', 'https')))
{
self::$error = "No connection adapters were found for '{$url}'";
return false;
}
$scheme = $parse_url['scheme'];
$domain = $parse_url['host'];
// Pick one of the IPs bound to this host at random (load balancing);
// the host name in the URL is replaced by the IP and sent as Host header instead
if (self::$hosts)
{
if (isset(self::$hosts[$domain]))
{
$hosts = self::$hosts[$domain];
$key = rand(0, count($hosts)-1);
$ip = $hosts[$key];
$url = str_replace($domain, $ip, $url);
self::$rawheaders['Host'] = $domain;
}
}
curl_setopt( self::$ch, CURLOPT_URL, $url );
if ($method != 'GET')
{
// POST
if ($method == 'POST')
{
//curl_setopt( self::$ch, CURLOPT_POST, true );
$tmpheaders = array_change_key_case(self::$rawheaders, CASE_LOWER);
// Some RESTful services only accept JSON data.
// CURLOPT_POST sets the upload type to multipart/form-data
// and encodes CURLOPT_POSTFIELDS as multipart/form-data.
// CURLOPT_CUSTOMREQUEST uploads the content as given.
if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' )
{
curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method );
}
else
{
curl_setopt( self::$ch, CURLOPT_POST, true );
}
$file_fields = array();
if (!empty($files))
{
foreach ($files as $postname => $file)
{
$filepath = realpath($file);
// Skip files that do not exist
if (!file_exists($filepath))
{
continue;
}
$filename = basename($filepath);
$type = self::get_mimetype($filepath);
$file_fields[$postname] = curl_file_create($filepath, $type, $filename);
// curl -F "name=seatle&file=@/absolute/path/to/image.png" htt://localhost/uploadfile.php
//$cfile = '@'.realpath($filename).";type=".$type.";filename=".$filename;
}
}
}
else
{
// Any other method: custom request plus an override header
self::$rawheaders['X-HTTP-Method-Override'] = $method;
curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method );
}
if ( $method == 'POST' )
{
// Without file uploads use http_build_query: better compatibility and a smaller request
if ( empty($file_fields) )
{
// Plain post
if ( is_array($fields) )
{
$fields = http_build_query($fields);
}
}
else
{
// There is post data besides the files
if ( is_array($fields) && !empty($fields) )
{
// May cause problems on some servers
$fields = array_merge($fields, $file_fields);
}
else
{
$fields = $file_fields;
}
}
// Passing an array directly is reported by the original author to be very slow
curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
}
}
// Global cookies merged with the cookies of this host; host cookies win on equal names
$cookies = self::get_cookies();
$domain_cookies = self::get_cookies($domain);
$cookies = array_merge($cookies, $domain_cookies);
// Send a Cookie header when there is anything to send
if (!empty($cookies))
{
foreach ($cookies as $key=>$value)
{
$cookie_arr[] = $key.'='.$value;
}
$cookies = implode('; ', $cookie_arr);
curl_setopt(self::$ch, CURLOPT_COOKIE, $cookies);
}
// Random User-Agent from the configured pool
if (!empty(self::$useragents))
{
$key = rand(0, count(self::$useragents) - 1);
self::$rawheaders['User-Agent'] = self::$useragents[$key];
}
// Random spoofed client IP from the configured pool
if (!empty(self::$client_ips))
{
$key = rand(0, count(self::$client_ips) - 1);
self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key];
self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key];
}
if (self::$rawheaders)
{
$http_headers = array();
foreach (self::$rawheaders as $k=>$v)
{
$http_headers[] = $k.': '.$v;
}
curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers );
}
curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
// Disable certificate verification
if ($scheme == 'https')
{
curl_setopt(self::$ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt(self::$ch, CURLOPT_SSL_VERIFYHOST, false);
}
// Random proxy from the configured pool
if (self::$proxies)
{
$key = rand(0, count(self::$proxies) - 1);
$proxy = self::$proxies[$key];
curl_setopt( self::$ch, CURLOPT_PROXY, $proxy );
}
// Ask for header + body; the header carries the cookies
curl_setopt( self::$ch, CURLOPT_HEADER, true );
// Follow redirects and return the final content
if ($allow_redirects)
{
curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true);
}
self::$raw = curl_exec ( self::$ch );
// Effective URL
//$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL);
self::$info = curl_getinfo( self::$ch );
//print_r(self::$info);
self::$status_code = self::$info['http_code'];
if (self::$raw === false)
{
self::$error = 'Curl error: ' . curl_error( self::$ch );
//trigger_error(self::$error, E_USER_WARNING);
}
// Close the handle
curl_close( self::$ch );
// Split the raw response and derive history, headers and cookies from the header part
list($header, $text) = self::split_header_body();
self::$history = self::get_history($header);
self::$headers = self::get_response_headers($header);
self::get_response_cookies($header, $domain);
//$data = substr($data, 10);
//$data = gzinflate($data);
return $text;
}
/**
 * Extract the status code of the last "... Found" status line in a header.
 *
 * @param string $header Raw response header text
 * @return int The status code of the last matching line, 0 when none matches
 */
public static function get_history($header)
{
    $found_code = 0;
    foreach (explode("\n", $header) as $raw_line)
    {
        $matched = preg_match("#^HTTP/.*? (\d+) Found#", trim($raw_line), $out);
        if (!$matched)
        {
            continue;
        }
        $found_code = empty($out[1]) ? 0 : intval($out[1]);
    }
    return $found_code;
}
/**
 * Get the MIME type of a file.
 *
 * @param string $filepath Path of the file to inspect
 * @return string The part of the finfo result before the first ';', or '' when empty
 */
public static function get_mimetype($filepath)
{
    $finfo = finfo_open(FILEINFO_MIME);
    $mime_info = finfo_file($finfo, $filepath);
    finfo_close($finfo);
    // The result looks like "type/subtype; charset=..."; keep the first part only.
    $pieces = explode(';', $mime_info);
    if (empty($pieces[0]))
    {
        return '';
    }
    return $pieces[0];
}
/**
 * Build a multipart/form-data body from form fields and files.
 *
 * Parameters in the Content-Disposition header are written as name="value"
 * without whitespace around '='.
 *
 * @param mixed $post_fields Form fields as name => content
 * @param mixed $file_fields Files as input name => array with the keys
 *                           'filename', 'type' and 'content'
 * @return array array($delimiter, $data): the boundary string and the encoded body
 * @author seatle <seatle@foxmail.com>
 * @created time :2017-08-03 18:06
 */
public static function get_postfile_form($post_fields, $file_fields)
{
    $data = '';
    $delimiter = '-------------' . uniqid();
    // Ordinary form fields.
    foreach ($post_fields as $name => $content)
    {
        $data .= '--'.$delimiter."\r\n";
        $data .= 'Content-Disposition: form-data; name="'.$name.'"';
        $data .= "\r\n\r\n";
        $data .= $content;
        $data .= "\r\n";
    }
    // File parts.
    foreach ($file_fields as $input_name => $file)
    {
        $data .= '--'.$delimiter."\r\n";
        $data .= 'Content-Disposition: form-data; name="'.$input_name.'";'.
            ' filename="'.$file['filename'].'"'."\r\n";
        $data .= "Content-Type: {$file['type']}\r\n";
        $data .= "\r\n";
        $data .= $file['content'];
        $data .= "\r\n";
    }
    // Closing boundary.
    $data .= '--'.$delimiter."--\r\n";
    return array($delimiter, $data);
}
/**
 * Convert HTML from its source charset to the output charset.
 *
 * When $in is not given, the source charset is taken from the first
 * <meta ... charset=...> tag, falling back to mb_detect_encoding(). After a
 * successful conversion that meta tag is rewritten to name the output
 * charset. When no source charset can be determined, or the conversion
 * fails, the input is returned unchanged.
 *
 * @param string $html HTML to convert
 * @param string $in   Source charset; detected when null
 * @param string $out  Target charset; overridden by self::$output_encoding
 *                     when that is set, defaults to UTF-8
 * @param string $mode auto|iconv|mb_convert_encoding
 * @return string
 * @throws Exception when $mode is invalid or no conversion function is available
 */
public static function encoding($html, $in = null, $out = null, $mode = 'auto')
{
    $valid = array(
        'auto',
        'iconv',
        'mb_convert_encoding',
    );
    if (isset(self::$output_encoding))
    {
        $out = self::$output_encoding;
    }
    if ( ! isset($out))
    {
        $out = 'UTF-8';
    }
    if ( ! in_array($mode, $valid))
    {
        throw new Exception('invalid mode, mode='.$mode);
    }
    // iconv is preferred in auto mode; mb_convert_encoding is the fallback.
    if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv'))
    {
        $func = 'iconv';
    }
    elseif (function_exists('mb_convert_encoding') && ($mode == 'auto' || $mode == 'mb_convert_encoding'))
    {
        $func = 'mb_convert_encoding';
    }
    else
    {
        throw new Exception('charsetTrans failed, no function');
    }
    $pattern = '/(<meta[^>]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is';
    if ( ! isset($in))
    {
        $in = preg_match($pattern, $html, $meta) > 0 ? $meta[3] : null;
        if (empty($in) and function_exists('mb_detect_encoding'))
        {
            $in = mb_detect_encoding($html, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1'));
        }
    }
    // Nothing detected (null, '' or false): leave the content alone.
    if (empty($in))
    {
        return $html;
    }
    // Content reported as ISO-8859-1 is treated as UTF-8.
    if ($in == 'ISO-8859-1')
    {
        $in = 'UTF-8';
    }
    $old = error_reporting(error_reporting() & ~E_NOTICE);
    try
    {
        // The two functions take their arguments in a different order.
        if ($func == 'iconv')
        {
            $converted = iconv($in, $out.'//IGNORE', $html);
        }
        else
        {
            $converted = mb_convert_encoding($html, $out, $in);
        }
    }
    catch (\ValueError $e)
    {
        // PHP 8 throws for unknown encoding names.
        $converted = false;
    }
    error_reporting($old);
    // Keep the original content when the conversion failed.
    if ($converted === false)
    {
        return $html;
    }
    return preg_replace($pattern, "\\1$out\\4", $converted, 1);
}
}

View File

@ -0,0 +1,588 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider选择器类文件
//----------------------------------
namespace phpspider\core;
use phpspider\library\phpquery;
use DOMDocument;
use DOMXpath;
use Exception;
class selector
{
/**
* 版本号
* @var string
*/
const VERSION = '1.0.2';
public static $dom = null;
public static $dom_auth = '';
public static $xpath = null;
public static $error = null;
/**
 * Extract content from HTML with an xpath, regex or css selector.
 *
 * @param string $html          Source HTML
 * @param string $selector      Selector expression
 * @param string $selector_type xpath|regex|css (case-insensitive)
 * @return mixed false when $html or $selector is empty, null for an unknown
 *               selector type, otherwise the result of the matching selector
 */
public static function select($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }
    switch (strtolower($selector_type))
    {
        case 'xpath':
            return self::_xpath_select($html, $selector);
        case 'regex':
            return self::_regex_select($html, $selector);
        case 'css':
            return self::_css_select($html, $selector);
    }
    return null;
}
/**
 * Remove the content matched by a selector from HTML.
 *
 * The matched fragment(s) are deleted with str_replace(), so only fragments
 * that occur literally in $html are removed.
 *
 * @param string $html          Source HTML
 * @param string $selector      Selector expression
 * @param string $selector_type xpath|regex|css (case-insensitive)
 * @return mixed false when $html or $selector is empty, otherwise the HTML
 *               with the matched content removed
 */
public static function remove($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }
    $remove_html = "";
    $selector_type = strtolower($selector_type);
    if ($selector_type == 'xpath')
    {
        $remove_html = self::_xpath_select($html, $selector, true);
    }
    elseif ($selector_type == 'regex')
    {
        $remove_html = self::_regex_select($html, $selector, true);
    }
    elseif ($selector_type == 'css')
    {
        $remove_html = self::_css_select($html, $selector, true);
    }
    // Nothing matched (or unknown type): return the HTML untouched instead
    // of handing null to str_replace().
    if ($remove_html === null || $remove_html === "" || $remove_html === array())
    {
        return $html;
    }
    return str_replace($remove_html, "", $html);
}
/**
 * XPath selector.
 *
 * The parsed document is cached in self::$dom / self::$xpath and only
 * re-parsed when the md5 of $html differs from the previous call.
 *
 * @param mixed $html     Source HTML
 * @param mixed $selector XPath expression
 * @param bool  $remove   When true, return the full markup of each matched node
 * @return mixed null on an invalid expression or no result, a string for a
 *               single result, an array for several results
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _xpath_select($html, $selector, $remove = false)
{
if (!is_object(self::$dom))
{
self::$dom = new DOMDocument();
}
// If this is not the previously loaded HTML, update the fingerprint and reload
if (self::$dom_auth != md5($html))
{
self::$dom_auth = md5($html);
@self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
self::$xpath = new DOMXpath(self::$dom);
}
//libxml_use_internal_errors(true);
//self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
//$errors = libxml_get_errors();
//if (!empty($errors))
//{
//print_r($errors);
//exit;
//}
$elements = @self::$xpath->query($selector);
if ($elements === false)
{
self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
// Original author's note: do not return false here, because callers must not
// test the result with !$values (!0 is true); null is returned instead
//return false;
return null;
}
$result = array();
if (!is_null($elements))
{
foreach ($elements as $element)
{
// For removal, take the whole block of markup
if ($remove)
{
$content = self::$dom->saveXml($element);
}
else
{
$nodeName = $element->nodeName;
$nodeType = $element->nodeType; // 1.Element 2.Attribute 3.Text
//$nodeAttr = $element->getAttribute('src');
//$nodes = util::node_to_array(self::$dom, $element);
//echo $nodes['@src']."\n";
// For an img element, return its src attribute
if ($nodeType == 1 && in_array($nodeName, array('img')))
{
$content = $element->getAttribute('src');
}
// For node types 2, 3 and 4, return the node value directly
elseif ($nodeType == 2 || $nodeType == 3 || $nodeType == 4)
{
$content = $element->nodeValue;
}
else
{
// Keep the inner markup so children can be extracted in a second pass;
// the element's own opening and closing tags are stripped
$content = self::$dom->saveXml($element);
//$content = trim(self::$dom->saveHtml($element));
$content = preg_replace(array("#^<{$nodeName}.*>#isU","#</{$nodeName}>$#isU"), array('', ''), $content);
}
}
$result[] = $content;
}
}
if (empty($result))
{
return null;
}
// A single element is returned as a string, several as an array
return count($result) > 1 ? $result : $result[0];
}
/**
 * CSS selector.
 *
 * The CSS selector is translated to XPath and evaluated by _xpath_select().
 *
 * @param mixed $html     Source HTML
 * @param mixed $selector CSS selector
 * @param bool  $remove   Passed through to _xpath_select()
 * @return mixed Whatever _xpath_select() returns
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _css_select($html, $selector, $remove = false)
{
    $xpath_selector = self::css_to_xpath($selector);
    return self::_xpath_select($html, $xpath_selector, $remove);
}
/**
 * Regex selector.
 *
 * In select mode the captured groups are returned: with one group the list
 * of its captures, with several groups one entry per group. A pattern
 * without capture groups yields null in select mode.
 * In remove mode the complete matches are returned, whatever the number of
 * capture groups, so the caller can delete exactly what matched.
 *
 * @param mixed $html     Source text
 * @param mixed $selector Regular expression including delimiters
 * @param bool  $remove   true to return complete matches for removal
 * @return mixed null on an invalid pattern or no match, a string for a
 *               single result, an array for several results
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-26 12:53
 */
private static function _regex_select($html, $selector, $remove = false)
{
    $matched = @preg_match_all($selector, $html, $out);
    if ($matched === false)
    {
        self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
        return null;
    }
    // No match at all: nothing to return, and no group offsets to read.
    if ($matched === 0)
    {
        return null;
    }
    $count = count($out);
    $result = array();
    if ($remove)
    {
        // For removal take everything that matched.
        $result = $out[0];
    }
    elseif ($count == 2)
    {
        // Exactly one capture group.
        $result = $out[1];
    }
    else
    {
        for ($i = 1; $i < $count; $i++)
        {
            // A group with a single capture is flattened to that capture.
            $result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0];
        }
    }
    if (empty($result))
    {
        return null;
    }
    return count($result) > 1 ? $result : $result[0];
}
// Placeholder: the body is empty, so the arguments are ignored and every call returns null.
public static function find_all($html, $selector)
{
}
/**
 * Translate a CSS selector into an XPath expression.
 *
 * Supported: tag names, '*', #id (exact match), .class (substring match on
 * the class attribute), [attr], [attr=value], [attr^=value], [attr*=value],
 * [attr$=value], the child combinator '>' and the descendant combinator ' '.
 * Sibling combinators (~, +) and pseudo classes are parsed but ignored.
 * An unrecognised token terminates the script via exit().
 *
 * @param string $selectors CSS selector
 * @return string XPath expression
 */
public static function css_to_xpath($selectors)
{
    $queries = self::parse_selector($selectors);
    // True when the previous token was a combinator, i.e. the next
    // predicate needs an explicit '*' node test.
    $delimiter_before = false;
    $xquery = '';
    foreach($queries as $s)
    {
        // TAG
        $is_tag = preg_match('@^[\w|\||-]+$@', $s) || $s == '*';
        if ($is_tag)
        {
            $xquery .= $s;
        }
        // ID
        else if ($s[0] == '#')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            // IDs are matched exactly
            $xquery .= "[@id='".substr($s, 1)."']";
        }
        // CLASSES
        else if ($s[0] == '.')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            // Classes are matched as a substring of the class attribute
            $xquery .= "[contains(@class,'".substr($s, 1)."')]";
        }
        // ATTRIBUTES
        else if ($s[0] == '[')
        {
            if ($delimiter_before)
            {
                $xquery .= '*';
            }
            // strip side brackets
            $attr = trim($s, '][');
            // attr with specified value
            if (mb_strpos($s, '='))
            {
                // Split on the first '=' only, so values may contain '='.
                list($attr, $value) = explode('=', $attr, 2);
                $value = trim($value, "'\"");
                if ($attr !== '' && self::is_regexp($attr))
                {
                    // The trailing character of the name is the match operator.
                    $operator = substr($attr, -1);
                    $attr = substr($attr, 0, -1);
                    if ($operator == '^')
                    {
                        // prefix match
                        $xquery .= "[starts-with(@{$attr},'{$value}')]";
                    }
                    else if ($operator == '*')
                    {
                        // substring match
                        $xquery .= "[contains(@{$attr},'{$value}')]";
                    }
                    else
                    {
                        // suffix match; XPath 1.0 has no ends-with()
                        $xquery .= "[substring(@{$attr},string-length(@{$attr})-string-length('{$value}')+1)='{$value}']";
                    }
                }
                else
                {
                    $xquery .= "[@{$attr}='{$value}']";
                }
            }
            // attr without specified value
            else
            {
                $xquery .= "[@{$attr}]";
            }
        }
        // ~ General Sibling Selector (ignored)
        else if ($s[0] == '~')
        {
        }
        // + Adjacent sibling selectors (ignored)
        else if ($s[0] == '+')
        {
        }
        // PSEUDO CLASSES (ignored)
        else if ($s[0] == ':')
        {
        }
        // DIRECT DESCENDANTS
        else if ($s == '>')
        {
            $xquery .= '/';
            $delimiter_before = 2;
        }
        // ALL DESCENDANTS
        else if ($s == ' ')
        {
            $xquery .= '//';
            $delimiter_before = 2;
        }
        // ERRORS
        else
        {
            exit("Unrecognized token '$s'");
        }
        // 2 marks "combinator seen in this iteration"; anything else resets the flag.
        $delimiter_before = $delimiter_before === 2;
    }
    return $xquery;
}
/**
 * Split a CSS selector string into a flat list of tokens.
 *
 * Tokens are tag names, '#id', '.class', '[attr...]', ':pseudo(...)',
 * '~...' / '+...' sibling parts and the combinators '>' and ' '.
 * Commas (and the spaces after them) are skipped without producing a token.
 * Unless the first token is '>', a ' ' token is put in front of the list.
 *
 * @param string $query CSS selector
 * @return array List of token strings; empty when $query is empty
 * @access private
 */
public static function parse_selector($query)
{
// Remove whitespace around >, + and ~, then collapse remaining whitespace runs to one space
$query = trim( preg_replace( '@\s+@', ' ', preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query) ) );
$queries = array();
if ( !$query )
{
return $queries;
}
$special_chars = array('>',' ');
$special_chars_mapping = array();
$strlen = mb_strlen($query);
$class_chars = array('.', '-');
$pseudo_chars = array('-');
$tag_chars = array('*', '|', '-');
// split multibyte string
// http://code.google.com/p/phpquery/issues/detail?id=76
$_query = array();
for ( $i=0; $i<$strlen; $i++ )
{
$_query[] = mb_substr($query, $i, 1);
}
$query = $_query;
// it works, but i dont like it...
$i = 0;
while( $i < $strlen )
{
$c = $query[$i];
$tmp = '';
// TAG
if ( self::is_char($c) || in_array($c, $tag_chars) )
{
while(isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $tag_chars)))
{
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
}
// IDs
else if ( $c == '#' )
{
$i++;
while( isset($query[$i]) && (self::is_char($query[$i]) || $query[$i] == '-') )
{
$tmp .= $query[$i];
$i++;
}
$queries[] = '#'.$tmp;
}
// SPECIAL CHARS
else if ( in_array($c, $special_chars) )
{
$queries[] = $c;
$i++;
// MAPPED SPECIAL MULTICHARS
// } else if ( $c.$query[$i+1] == '//') {
// $return[] = ' ';
// $i = $i+2;
}
// MAPPED SPECIAL CHARS ($special_chars_mapping is empty, so this branch never fires)
else if ( isset($special_chars_mapping[$c]))
{
$queries[] = $special_chars_mapping[$c];
$i++;
}
// COMMA: skipped together with the spaces that follow it
else if ( $c == ',' )
{
$i++;
while( isset($query[$i]) && $query[$i] == ' ')
{
$i++;
}
}
// CLASSES: the leading '.' is part of the token because '.' is in $class_chars
else if ($c == '.')
{
while( isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $class_chars)))
{
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
}
// ~ General Sibling Selector
else if ($c == '~')
{
// Spaces are accepted only directly after the '~'
$space_allowed = true;
$tmp .= $query[$i++];
while( isset($query[$i])
&& (self::is_char($query[$i])
|| in_array($query[$i], $class_chars)
|| $query[$i] == '*'
|| ($query[$i] == ' ' && $space_allowed)
))
{
if ($query[$i] != ' ')
{
$space_allowed = false;
}
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
}
// + Adjacent sibling selectors
else if ($c == '+')
{
// Spaces are accepted only directly after the '+'
$space_allowed = true;
$tmp .= $query[$i++];
while( isset($query[$i])
&& (self::is_char($query[$i])
|| in_array($query[$i], $class_chars)
|| $query[$i] == '*'
|| ($space_allowed && $query[$i] == ' ')
))
{
if ($query[$i] != ' ')
$space_allowed = false;
$tmp .= $query[$i];
$i++;
}
$queries[] = $tmp;
}
// ATTRS: collect up to the matching ']', tracking nested brackets
else if ($c == '[')
{
$stack = 1;
$tmp .= $c;
while( isset($query[++$i]))
{
$tmp .= $query[$i];
if ( $query[$i] == '[')
{
$stack++;
}
else if ( $query[$i] == ']')
{
$stack--;
if (! $stack )
{
break;
}
}
}
$queries[] = $tmp;
$i++;
}
// PSEUDO CLASSES
else if ($c == ':')
{
$stack = 1;
$tmp .= $query[$i++];
while( isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $pseudo_chars)))
{
$tmp .= $query[$i];
$i++;
}
// with arguments ? collect up to the matching ')'
if ( isset($query[$i]) && $query[$i] == '(')
{
$tmp .= $query[$i];
$stack = 1;
while( isset($query[++$i]))
{
$tmp .= $query[$i];
if ( $query[$i] == '(')
{
$stack++;
}
else if ( $query[$i] == ')')
{
$stack--;
if (! $stack )
{
break;
}
}
}
$queries[] = $tmp;
$i++;
}
else
{
$queries[] = $tmp;
}
}
// Any other character is skipped
else
{
$i++;
}
}
if (isset($queries[0]))
{
// A selector starting with a pseudo class gets an implicit '*'
if (isset($queries[0][0]) && $queries[0][0] == ':')
{
array_unshift($queries, '*');
}
// Unless it starts with '>', the selector gets a leading descendant combinator
if ($queries[0] != '>')
{
array_unshift($queries, ' ');
}
}
return $queries;
}
/**
 * Test whether the argument contains a word character (\w).
 *
 * @param string $char Text to test
 * @return int|false Result of preg_match(): 1 on a match, 0 otherwise
 */
public static function is_char($char)
{
    $word_char_hits = preg_match('@\w@', $char);
    return $word_char_hits;
}
/**
 * Test whether an attribute name ends with a fuzzy-match operator.
 *   ^ prefix match
 *   * substring match
 *   $ suffix match
 *
 * @param string $pattern Attribute name as written before the '='
 * @return bool true when the last character is one of ^ * $
 * @access private
 */
protected static function is_regexp($pattern)
{
    $pattern = (string) $pattern;
    // An empty name has no last character to inspect.
    if ($pattern === '')
    {
        return false;
    }
    // The operators are single-byte, so the last byte is sufficient.
    return in_array(substr($pattern, -1), array('^', '*', '$'), true);
}
}

936
vendor/owner888/phpspider/core/util.php vendored Normal file
View File

@ -0,0 +1,936 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider实用函数集合类文件
//----------------------------------
namespace phpspider\core;
// 引入PATH_DATA
require_once __DIR__ . '/constants.php';
class util
{
/**
 * File based lock.
 *
 * Returns true when a lock file exists whose timestamp is younger than
 * $lock_timeout seconds. Otherwise a stale lock file is removed, a fresh
 * one holding the current time is written, and false is returned.
 *
 * Usage:
 *   if (!util::lock('statistics_offer'))
 *   {
 *       ...
 *       util::unlock('statistics_offer');
 *   }
 *   else
 *   {
 *       echo "process has been locked\n";
 *   }
 *
 * @param mixed $lock_name    Name of the lock (file PATH_DATA/lock/<name>.lock)
 * @param int   $lock_timeout Seconds after which an existing lock counts as stale
 * @return bool true when already locked, false when the lock was just taken
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-02-18 14:28
 */
public static function lock($lock_name, $lock_timeout = 600)
{
    $lock_file = PATH_DATA."/lock/{$lock_name}.lock";
    $locked_at = util::get_file($lock_file);
    if ($locked_at)
    {
        // Younger than the timeout: the owning process is assumed to be alive.
        $age = time() - $locked_at;
        if ($age < $lock_timeout)
        {
            return true;
        }
        unlink($lock_file);
    }
    util::put_file($lock_file, time());
    return false;
}
/**
 * Release a lock taken with lock().
 *
 * Does nothing when the lock file does not exist.
 *
 * @param mixed $lock_name Name of the lock (file PATH_DATA/lock/<name>.lock)
 * @return void
 */
public static function unlock($lock_name)
{
    $lock_file = PATH_DATA."/lock/{$lock_name}.lock";
    // Guard against the warning unlink() raises for a missing file.
    if (is_file($lock_file))
    {
        unlink($lock_file);
    }
}
/**
 * Format a number of seconds as a human readable duration.
 *
 * Whole years are split off first and do not appear in the output.
 *
 * @param mixed $time   Duration in seconds
 * @param bool  $is_log true for the compact form "Xd Xh Xm Xs",
 *                      false for "X days X hours X minutes"
 * @return string|false The formatted duration, false when $time is not numeric
 */
public static function time2second($time, $is_log = true)
{
    if (!is_numeric($time))
    {
        return false;
    }
    // Unit lengths in seconds, largest first.
    $unit_seconds = array(
        "years"   => 31556926,
        "days"    => 86400,
        "hours"   => 3600,
        "minutes" => 60,
    );
    $value = array(
        "years" => 0, "days" => 0, "hours" => 0,
        "minutes" => 0, "seconds" => 0,
    );
    foreach ($unit_seconds as $unit => $span)
    {
        if ($time >= $span)
        {
            $value[$unit] = floor($time/$span);
            $time = ($time%$span);
        }
    }
    $value["seconds"] = floor($time);
    if ($is_log)
    {
        return $value["days"] ."d ". $value["hours"] ."h ". $value["minutes"] ."m ".$value["seconds"]."s";
    }
    return $value["days"] ." days ". $value["hours"] ." hours ". $value["minutes"] ." minutes";
}
/**
 * List the dates between two days.
 *
 * @param string      $day_sta First day, any format strtotime() accepts
 * @param string|bool $day_end Last day; true means today
 * @param int         $range   Step in seconds
 * @return array Dates formatted as Y-m-d
 */
public static function get_days($day_sta, $day_end = true, $range = 86400)
{
    if ($day_end === true)
    {
        $day_end = date('Y-m-d');
    }
    $days = array();
    foreach (range(strtotime($day_sta), strtotime($day_end), $range) as $timestamp)
    {
        $days[] = date('Y-m-d', $timestamp);
    }
    return $days;
}
/**
 * Count the lines of a file.
 *
 * Empty lines and lines consisting of "0" are counted as well.
 *
 * @param mixed $filepath Path of the file
 * @return int Number of lines, 0 when the file cannot be opened
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-03-31 21:54
 */
public static function get_file_line($filepath)
{
    if (!is_file($filepath))
    {
        return 0;
    }
    $fp = fopen($filepath, 'r');
    if (!$fp)
    {
        return 0;
    }
    $line = 0;
    // Compare against false explicitly: an empty line is returned as "",
    // which would otherwise end the loop early.
    while (stream_get_line($fp, 8192, "\n") !== false)
    {
        $line++;
    }
    fclose($fp);
    return $line;
}
/**
 * Compute the shard number for a value.
 *
 * @param mixed $item_value Value to shard on (lower-cased before hashing)
 * @param int   $table_num  Number of tables
 * @return string Shard number, left-padded with zeros to 2 digits
 *                (3 digits when $table_num is greater than 100)
 * @author seatle <seatle@foxmail.com>
 * @created time :2015-10-22 23:25
 */
public static function get_table_num($item_value, $table_num = 100)
{
    // sha1 yields 40 hex characters; the last two are turned into a decimal number.
    $hash = sha1(strtolower($item_value));
    $bucket = base_convert(substr($hash, -2), 16, 10) % $table_num;
    $width = $table_num > 100 ? 3 : 2;
    return str_pad($bucket, $width, "0", STR_PAD_LEFT);
}
/**
 * Compute the sharded table name for a value.
 *
 * @param mixed $table_name Base table name
 * @param mixed $item_value Value to shard on (lower-cased before hashing)
 * @param int   $table_num  Number of tables
 * @return string "<table_name>_<shard number>"
 * @author seatle <seatle@foxmail.com>
 * @created time :2015-10-22 23:25
 */
public static function get_table_name($table_name, $item_value, $table_num = 100)
{
    // The suffix is the zero-padded shard number.
    $suffix = self::get_table_num($item_value, $table_num);
    return $table_name."_".$suffix;
}
/**
 * Current memory usage as a formatted string.
 *
 * @return string Result of format_bytes() for memory_get_usage()
 */
public static function memory_get_usage()
{
    return self::format_bytes(memory_get_usage());
}
/**
 * Peak memory usage as a formatted string.
 *
 * @return string Result of format_bytes() for memory_get_peak_usage()
 */
public static function memory_get_peak_usage()
{
    return self::format_bytes(memory_get_peak_usage());
}
/**
 * Convert a byte count into a human readable size.
 *
 * @param int|float $size Number of bytes
 * @return string The size rounded to two decimals followed by a unit
 *                (b, kb, mb, gb, tb, pb); '0 b' for zero or negative input
 */
public static function format_bytes($size)
{
    $unit = array('b', 'kb', 'mb', 'gb', 'tb', 'pb');
    $size = (float) $size;
    // log(0) is -INF, so zero and negative sizes are handled up front.
    if ($size <= 0)
    {
        return '0 ' . $unit[0];
    }
    $i = 0;
    $last = count($unit) - 1;
    // Divide down until the value fits the unit, never past the largest unit.
    while ($size >= 1024 && $i < $last)
    {
        $size /= 1024;
        $i++;
    }
    return round($size, 2) . ' ' . $unit[$i];
}
/**
 * Estimate the size of an array.
 *
 * The size is the length of the print_r() dump after removing each newline
 * together with the indentation that follows it.
 *
 * @param mixed $arr Array to measure
 * @return string Size formatted by format_bytes()
 */
public static function array_size($arr)
{
    $dump = print_r($arr, true);
    $compact = preg_replace("/\n +/", "", $dump);
    return self::format_bytes(strlen($compact));
}
/**
 * Random string of decimal digits.
 *
 * @param int $num Number of digits
 * @return string
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function rand_num($num = 7)
{
    $digits = array();
    for ($i = 0; $i < $num; $i ++)
    {
        $digits[] = mt_rand(0, 9);
    }
    return implode("", $digits);
}
/**
 * Random string of lower-case letters and digits.
 *
 * @param int $num Length of the string
 * @return string A string of exactly $num characters
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function rand_str($num = 10)
{
    $chars = 'abcdefghijklmnopqrstuvwxyz0123456789';
    // rand() includes its upper bound, so the last valid index is length - 1.
    $max_index = strlen($chars) - 1;
    $string = "";
    for ($i = 0; $i < $num; $i ++)
    {
        $string .= $chars[rand(0, $max_index)];
    }
    return $string;
}
/**
 * Convert Chinese text to pinyin.
 *
 * The text is converted from UTF-8 to GBK and every double-byte character is
 * looked up in a table loaded from PATH_DATA/pinyin.dat into the global
 * $pinyins. Each line of that file is read as a two-byte key, one separator
 * byte and the pinyin text. ASCII letters and digits are copied as they are;
 * everything else is dropped.
 *
 * @param mixed $str     UTF-8 text
 * @param int   $ishead  0 for the full pinyin, otherwise only the first letter of each syllable
 * @param int   $isclose 0 to discard the loaded table after the call
 * @static
 * @access public
 * @return string Input shorter than two bytes (after conversion and trim) is returned as it is
 */
public static function pinyin($str, $ishead = 0, $isclose = 1)
{
    // $str = iconv("utf-8", "gbk//ignore", $str);
    $str = mb_convert_encoding($str, "gbk", "utf-8");
    global $pinyins;
    $restr = '';
    $str = trim($str);
    $slen = strlen($str);
    if ($slen < 2)
    {
        return $str;
    }
    // Load the table on first use. The global may not be initialised yet,
    // so it is not passed to count() unchecked.
    if (!is_array($pinyins) || count($pinyins) == 0)
    {
        $pinyins = array();
        $fp = fopen(PATH_DATA . '/pinyin.dat', 'r');
        if ($fp)
        {
            while (($line = fgets($fp)) !== false)
            {
                $line = trim($line);
                // Skip lines too short to hold a key plus pinyin text.
                if (strlen($line) < 4)
                {
                    continue;
                }
                $pinyins[$line[0] . $line[1]] = substr($line, 3);
            }
            fclose($fp);
        }
    }
    for ($i = 0; $i < $slen; $i ++)
    {
        if (ord($str[$i]) > 0x80)
        {
            // A lead byte without a trail byte cannot be looked up.
            if ($i + 1 >= $slen)
            {
                break;
            }
            $c = $str[$i] . $str[$i + 1];
            $i ++;
            if (isset($pinyins[$c]))
            {
                $restr .= ($ishead == 0) ? $pinyins[$c] : $pinyins[$c][0];
            }
        }
        else if (preg_match("/[a-z0-9]/i", $str[$i]))
        {
            $restr .= $str[$i];
        }
    }
    if ($isclose == 0)
    {
        // Assign through the global alias; unset() would only drop the local name.
        $pinyins = null;
    }
    return $restr;
}
/**
 * Return the index letter for a string.
 *
 * ASCII letters and digits are returned unchanged. For Chinese text the
 * first character is converted to GBK and mapped to the initial letter of
 * its pinyin through the code ranges below.
 *
 * @param mixed $s0 Input string (UTF-8)
 * @return string|int The letter/digit, or 0 when no mapping exists
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function letter_first($s0)
{
    $s0 = (string) $s0;
    if ($s0 === '')
    {
        return 0;
    }
    // Square-bracket offsets: the {} offset syntax is a parse error on PHP 8
    $firstchar_ord = ord(strtoupper($s0[0]));
    // 'A'-'Z' is 65-90 (91 is '['), '0'-'9' is 48-57
    if (($firstchar_ord >= 65 && $firstchar_ord <= 90) || ($firstchar_ord >= 48 && $firstchar_ord <= 57))
    {
        return $s0[0];
    }
    $s = mb_convert_encoding($s0, "gbk", "utf-8");
    if (strlen($s) < 2)
    {
        return 0;
    }
    $asc = ord($s[0]) * 256 + ord($s[1]) - 65536;
    // Inclusive [low, high] range of $asc for each initial letter
    $ranges = array(
        'A' => array(-20319, -20284),
        'B' => array(-20283, -19776),
        'C' => array(-19775, -19219),
        'D' => array(-19218, -18711),
        'E' => array(-18710, -18527),
        'F' => array(-18526, -18240),
        'G' => array(-18239, -17923),
        'H' => array(-17922, -17418),
        'J' => array(-17417, -16475),
        'K' => array(-16474, -16213),
        'L' => array(-16212, -15641),
        'M' => array(-15640, -15166),
        'N' => array(-15165, -14923),
        'O' => array(-14922, -14915),
        'P' => array(-14914, -14631),
        'Q' => array(-14630, -14150),
        'R' => array(-14149, -14091),
        'S' => array(-14090, -13319),
        'T' => array(-13318, -12839),
        'W' => array(-12838, -12557),
        'X' => array(-12556, -11848),
        'Y' => array(-11847, -11056),
        'Z' => array(-11055, -10247),
    );
    foreach ($ranges as $letter => $range)
    {
        if ($asc >= $range[0] && $asc <= $range[1])
        {
            return $letter;
        }
    }
    return 0; // no mapping
}
/**
 * Timestamp of 23:59:59 on the day that lies $day days before today.
 *
 * @param mixed $day Number of days to go back (converted with intval())
 * @return int Unix timestamp
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function getxtime($day)
{
    $offset = intval($day);
    $month = date("m");
    $day_of_month = date("d") - $offset;
    $year = date("y");
    return mktime(23, 59, 59, $month, $day_of_month, $year);
}
/**
 * Read the content behind a URL.
 *
 * cURL is tried first when the extension is available; if it is missing or
 * returns nothing, file_get_contents() is used with an HTTP timeout.
 *
 * @param string $url     Address to read
 * @param int    $timeout Timeout in seconds
 * @return string|false Content, or false when nothing could be read
 */
public static function get_file($url, $timeout = 10)
{
    if (function_exists('curl_init'))
    {
        $handle = curl_init();
        $options = array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => 0,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => 10,
        );
        foreach ($options as $option => $value)
        {
            curl_setopt($handle, $option, $value);
        }
        $body = curl_exec($handle);
        curl_close($handle);
        if ($body)
        {
            return $body;
        }
    }
    $context = stream_context_create(array('http' => array('timeout' => $timeout)));
    $body = @file_get_contents($url, 0, $context);
    return $body ? $body : false;
}
/**
 * Write a file, creating its directory recursively when it does not exist.
 *
 * @param string $file    Target file path
 * @param string $content Data to write
 * @param int    $flag    Pass FILE_APPEND to append; any other value overwrites
 * @return int|false Number of bytes written, or false on failure
 */
public static function put_file($file, $content, $flag = 0)
{
    $parts = pathinfo($file);
    $dir_missing = !empty($parts['dirname']) && file_exists($parts['dirname']) === false;
    if ($dir_missing && @mkdir($parts['dirname'], 0777, true) === false)
    {
        return false;
    }
    // Appends are written without LOCK_EX (the locked variant was left
    // disabled by the original author); overwrites take an exclusive lock.
    $mode = ($flag === FILE_APPEND) ? FILE_APPEND : LOCK_EX;
    return @file_put_contents($file, $content, $mode);
}
/**
 * Make sure a directory path exists, creating it recursively if needed.
 *
 * @param mixed $path Directory path
 * @static
 * @access public
 * @return bool|string The path itself, or false when it could not be created
 */
public static function path_exists($path)
{
    // A dummy file name is appended so that pathinfo() reports $path as the directory
    $parts = pathinfo($path . '/tmp.txt');
    if (empty($parts['dirname']))
    {
        return $path;
    }
    if (file_exists($parts['dirname']) !== false)
    {
        return $path;
    }
    if (mkdir($parts['dirname'], 0777, true) === false)
    {
        return false;
    }
    return $path;
}
/**
 * Delete a directory together with everything inside it.
 *
 * @param mixed $dir Directory to remove
 * @return bool True when the directory itself was removed
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public static function deldir($dir)
{
    if (!is_dir($dir))
    {
        return false;
    }
    $dh = opendir($dir);
    if ($dh === false)
    {
        return false;
    }
    // Compare against false explicitly: an entry named "0" is falsy and
    // would otherwise end the loop early, leaving the directory non-empty.
    while (($file = readdir($dh)) !== false)
    {
        if ($file == "." || $file == "..")
        {
            continue;
        }
        $fullpath = $dir . "/" . $file;
        // A symlink to a directory is removed as a link; its target is not entered
        if (is_dir($fullpath) && !is_link($fullpath))
        {
            self::deldir($fullpath);
        }
        else
        {
            unlink($fullpath);
        }
    }
    closedir($dh);
    // Finally remove the (now empty) directory itself
    return rmdir($dir);
}
/**
 * Change permissions of a path recursively.
 *
 * The walk stops with false at the first symbolic link or at the first
 * entry whose mode cannot be changed.
 *
 * @param mixed $path     File or directory
 * @param mixed $filemode Permission bits, e.g. 0755
 * @return bool
 */
public static function chmodr($path, $filemode)
{
    if (!is_dir($path))
    {
        return @chmod($path, $filemode);
    }
    $dh = opendir($path);
    if ($dh === false)
    {
        return false;
    }
    $ok = true;
    while (($file = readdir($dh)) !== false)
    {
        if ($file == '.' || $file == '..')
        {
            continue;
        }
        $fullpath = $path . '/' . $file;
        if (is_link($fullpath) || !self::chmodr($fullpath, $filemode))
        {
            $ok = false;
            break;
        }
    }
    // Always release the handle, including on the failure paths
    closedir($dh);
    return $ok && @chmod($path, $filemode);
}
/**
 * Format an array of values as one CSV line.
 *
 * Commas inside the values are removed (not quoted) so that they cannot be
 * mistaken for field separators.
 *
 * @param mixed $data List of field values
 * @return string Fields joined with ","
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-07-29 11:32
 */
public static function format_csv($data)
{
    foreach ($data as $k => $v)
    {
        // ASCII comma and full-width comma. The second search string was
        // empty, which made that replacement a no-op.
        $data[$k] = str_replace(array(",", "，"), "", $v);
    }
    return implode(",", $data);
}
/**
 * Check whether a string is valid UTF-8.
 *
 * The string is converted to UTF-32 and back; it counts as UTF-8 when the
 * round trip reproduces it exactly.
 *
 * @param string $str
 * @return bool
 */
public static function is_utf8($str)
{
    $as_utf32 = mb_convert_encoding($str, "UTF-32", "UTF-8");
    $round_trip = mb_convert_encoding($as_utf32, "UTF-8", "UTF-32");
    return $str === $round_trip;
}
/**
 * Detect the character encoding of a string.
 *
 * @param string $string Text to inspect
 * @return string Lower-cased name of the first matching candidate encoding
 */
public static function get_encoding($string)
{
    $candidates = array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5');
    $detected = mb_detect_encoding($string, $candidates);
    return strtolower($detected);
}
/**
 * Convert the character encoding of every string in an array.
 *
 * String keys and string values are converted at every nesting level;
 * other values are returned untouched. The conversion walks the data
 * directly rather than eval()-ing a var_export() dump, so the content is
 * never executed as code.
 *
 * @param array  $arr           Data to convert
 * @param string $from_encoding Current encoding
 * @param string $to_encoding   Target encoding
 * @return array
 */
public static function array_iconv($arr, $from_encoding, $to_encoding)
{
    $target = $to_encoding . '//IGNORE';
    $convert = function ($value) use (&$convert, $from_encoding, $target)
    {
        if (is_string($value))
        {
            // iconv() raises a notice and returns false on undecodable
            // input; keep the original text in that case
            $converted = @iconv($from_encoding, $target, $value);
            return $converted === false ? $value : $converted;
        }
        if (is_array($value))
        {
            $result = array();
            foreach ($value as $key => $item)
            {
                if (is_string($key))
                {
                    $key = $convert($key);
                }
                $result[$key] = $convert($item);
            }
            return $result;
        }
        return $value;
    };
    return $convert($arr);
}
/**
 * Convert a date/time string to a Unix timestamp, with support for Chinese
 * date notation such as "2016年09月18日 10时17分05秒".
 *
 * Accepted shapes are "Y-m-d", "Y.m.d" and the Chinese form, optionally
 * followed by "H:i:s". A two-digit year is prefixed with "20".
 *
 * @param string $dtime Date/time text, or a value made only of digits
 * @return int|string|false Timestamp; a digits-only input is returned unchanged
 */
public static function cn_strtotime($dtime)
{
    // Only digits: treat it as a timestamp already
    if (!preg_match("/[^0-9]/", $dtime))
    {
        return $dtime;
    }
    $dtime = trim($dtime);
    $dt = array(1970, 1, 1, 0, 0, 0);
    // "日" (day) and "秒" (second) end a field and become blanks
    $dtime = preg_replace("/[\r\n\t]|日|秒/", " ", $dtime);
    // "年"/"月" (year/month) separate date fields, "时"/"分" (hour/minute)
    // separate time fields. These search strings were empty, which made the
    // replacements no-ops.
    $dtime = str_replace(array("年", "月"), "-", $dtime);
    $dtime = str_replace(array("时", "分"), ":", $dtime);
    $dtime = trim(preg_replace("/[ ]{1,}/", " ", $dtime));
    $ds = explode(" ", $dtime);
    $ymd = explode("-", $ds[0]);
    if (!isset($ymd[1]))
    {
        $ymd = explode(".", $ds[0]);
    }
    foreach (array(0, 1, 2) as $idx)
    {
        if (isset($ymd[$idx]))
        {
            $dt[$idx] = $ymd[$idx];
        }
    }
    if (strlen($dt[0]) == 2)
    {
        $dt[0] = '20' . $dt[0];
    }
    if (isset($ds[1]))
    {
        $hms = explode(":", $ds[1]);
        foreach (array(0, 1, 2) as $idx)
        {
            if (isset($hms[$idx]))
            {
                $dt[3 + $idx] = $hms[$idx];
            }
        }
    }
    // Normalise every field to an integer: this drops leading zeros and
    // blanks and turns empty or non-numeric fields into 0, so mktime()
    // always receives integers.
    foreach ($dt as $k => $v)
    {
        $dt[$k] = (int) trim($v);
    }
    $mt = mktime($dt[3], $dt[4], $dt[5], $dt[1], $dt[2], $dt[0]);
    if (!empty($mt))
    {
        return $mt;
    }
    return strtotime($dtime);
}
/**
 * Truncate a UTF-8 string without cutting a multi-byte character in half.
 *
 * @param string $string      Text to shorten
 * @param int    $length      Maximum width; 0 returns an empty string
 * @param string $etc         Suffix appended when the text is cut
 * @param bool   $count_words True: multi-byte characters count as width 2 and
 *                            single-byte ones as 1; false: $length is a plain
 *                            character count
 * @return string
 */
public static function cn_substr($string, $length = 80, $etc = '...', $count_words = true)
{
    mb_internal_encoding("UTF-8");
    if ($length == 0)
    {
        return '';
    }
    // Short enough already (measured in bytes)
    if (strlen($string) <= $length)
    {
        return $string;
    }
    // Split into complete UTF-8 characters
    preg_match_all("/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/", $string, $info);
    $chars = $info[0];
    if (!$count_words)
    {
        return join("", array_slice($chars, 0, $length)) . $etc;
    }
    $width = 0;
    $wordscut = "";
    foreach ($chars as $char)
    {
        $wordscut .= $char;
        $width += (ord($char) >= 128) ? 2 : 1;
        if ($width >= $length)
        {
            return $wordscut . $etc;
        }
    }
    return join('', $chars);
}
/**
 * Return the lower-cased text after the last dot of a file name.
 *
 * A name without any dot is returned whole (lower-cased).
 *
 * @param mixed $file_name File name
 * @static
 *
 * @access public
 * @return string
 */
public static function get_extension($file_name)
{
    $segments = explode('.', $file_name);
    return strtolower(end($segments));
}
/**
 * Resolve the address a URL redirects to.
 *
 * @param string $url Address to check
 * @return string The last Location of a 301/302 response, otherwise $url itself
 */
public static function getrealurl($url)
{
    if (empty($url))
    {
        return $url;
    }
    $header = get_headers($url, 1);
    // get_headers() returns false on failure. Index 1 (a second status
    // line) is only present when a redirect was followed.
    if (!is_array($header) || empty($header[0]) || empty($header[1]))
    {
        return $url;
    }
    $is_redirect = strpos($header[0], '301') !== false || strpos($header[0], '302') !== false;
    if (!$is_redirect)
    {
        return $url;
    }
    // Header names are case-insensitive; some servers send "location"
    $named = array_change_key_case($header, CASE_LOWER);
    if (empty($named['location']))
    {
        return $url;
    }
    $location = $named['location'];
    // Several redirects produce a list; the last entry is the final target
    return is_array($location) ? end($location) : $location;
}
/**
 * Decompress data that a server sent with "Content-Encoding: gzip".
 *
 * The gzip header is skipped according to its flag byte and the remainder
 * is inflated. When inflating fails the input is returned unchanged.
 *
 * @param string $data gzip-compressed bytes
 * @return string
 */
public static function gzdecode($data)
{
    $flags = ord(substr($data, 3, 1));
    // Fixed part of the gzip header
    $offset = 10;
    if ($flags & 4)
    {
        // Extra field: two-byte little-endian length followed by the payload
        $field = unpack('v', substr($data, 10, 2));
        $offset += 2 + $field[1];
    }
    if ($flags & 8)
    {
        // NUL-terminated file name
        $offset = strpos($data, chr(0), $offset) + 1;
    }
    if ($flags & 16)
    {
        // NUL-terminated comment
        $offset = strpos($data, chr(0), $offset) + 1;
    }
    if ($flags & 2)
    {
        // Header CRC
        $offset += 2;
    }
    $inflated = @gzinflate(substr($data, $offset));
    return ($inflated === FALSE) ? $data : $inflated;
}
/**
 * Convert a monetary amount to Chinese numerals.
 *
 * The amount is rounded to two decimals. The integer part is suffixed with
 * "元" (yuan), the first decimal with "角" (jiao) and the second with
 * "分" (fen).
 *
 * @param string|integer|float $num Amount to convert
 * @param boolean $sim True for the simple numerals (一二三…), false (default)
 *                     for the formal financial numerals (壹贰叁…)
 * @return string
 */
public static function number2chinese($num, $sim = FALSE)
{
    if (!is_numeric($num)) return '含有非数字非小数点字符!';
    $char = $sim ? array('零', '一', '二', '三', '四', '五', '六', '七', '八', '九') : array('零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖');
    $unit = $sim ? array('', '十', '百', '千', '', '万', '亿', '兆') : array('', '拾', '佰', '仟', '', '萬', '億', '兆');
    $retval = '';
    $num = sprintf("%01.2f", $num);
    list ($num, $dec) = explode('.', $num);
    // Decimal part. The unit literals 角/分/元 were empty strings, so the
    // output carried no currency units.
    if ($dec[0] > 0)
    {
        $retval .= $char[(int) $dec[0]] . '角';
    }
    if ($dec[1] > 0)
    {
        $retval .= $char[(int) $dec[1]] . '分';
    }
    // Integer part, processed from the lowest digit upwards
    if ($num > 0)
    {
        $retval = "元" . $retval;
        $f = 1;
        $out = array();
        $str = strrev((string) intval($num));
        for ($i = 0, $c = strlen($str); $i < $c; $i++)
        {
            if ($str[$i] > 0)
            {
                $f = 0;
            }
            // Trailing zeros (before any non-zero digit was seen) are not spelled out
            if ($f == 1 && $str[$i] == 0)
            {
                $out[$i] = "";
            }
            else
            {
                $out[$i] = $char[(int) $str[$i]];
            }
            $out[$i] .= $str[$i] != '0' ? $unit[$i % 4] : '';
            // Collapse runs of consecutive zeros
            if ($i > 1 and (int) $str[$i] + (int) $str[$i - 1] == 0)
            {
                $out[$i] = '';
            }
            // Every fourth digit carries the group unit (万, 亿, 兆)
            if ($i % 4 == 0)
            {
                $group = 4 + (int) floor($i / 4);
                $out[$i] .= isset($unit[$group]) ? $unit[$group] : '';
            }
        }
        $retval = join('', array_reverse($out)) . $retval;
    }
    return $retval;
}
/**
 * Wrap a string in an ANSI colour escape sequence for terminal output.
 *
 * @param string $str    Text to colour
 * @param string $status One of "succ" (code 32, green), "error" (31, red),
 *                       "warn" (33, yellow), "note" (34, blue) or
 *                       "debug" (36, cyan); anything else uses the reset code
 * @return string The text prefixed with the colour code and followed by a reset
 */
public static function colorize($str, $status = "info")
{
    if ($status == 'succ')
    {
        $prefix = "\033[32m";
    }
    elseif ($status == "error")
    {
        $prefix = "\033[31m";
    }
    elseif ($status == "warn")
    {
        $prefix = "\033[33m";
    }
    elseif ($status == "note")
    {
        $prefix = "\033[34m";
    }
    elseif ($status == "debug")
    {
        $prefix = "\033[36m";
    }
    else
    {
        $prefix = "\033[0m";
    }
    return $prefix . $str . "\033[0m";
}
/**
 * Convert a DOM node into a nested array.
 *
 * Attributes are stored under "@name" keys. A child that contains a single
 * text node is stored as its text value; any other child is converted
 * recursively. Children sharing a local name overwrite each other.
 *
 * @param DOMDocument $dom  Owning document
 * @param DOMNode     $node Node to convert
 * @return array|string|false False for invalid arguments or nodes without a local name
 */
public static function node_to_array($dom, $node)
{
    $valid = is_a($dom, 'DOMDocument') && is_a($node, 'DOMNode');
    if (!$valid)
    {
        return false;
    }
    // Nodes without a local name are discarded
    $local_name = trim($node->localName);
    if (empty($local_name))
    {
        return false;
    }
    if (XML_TEXT_NODE == $node->nodeType)
    {
        return $node->nodeValue;
    }
    $result = array();
    foreach ($node->attributes as $attribute)
    {
        $result['@' . $attribute->localName] = $attribute->nodeValue;
    }
    foreach ($node->childNodes as $child)
    {
        $holds_single_text = isset($child->childNodes->length)
            && 1 == $child->childNodes->length
            && XML_TEXT_NODE == $child->firstChild->nodeType;
        if ($holds_single_text)
        {
            $result[$child->localName] = $child->nodeValue;
            continue;
        }
        $converted = self::node_to_array($dom, $child);
        if (false !== $converted)
        {
            $result[$child->localName] = $converted;
        }
    }
    return $result;
}
/**
 * Tell whether PHP is running on Windows (PHP_OS starts with "WIN").
 *
 * @return bool
 */
public static function is_win()
{
    return strncasecmp(PHP_OS, 'WIN', 3) === 0;
}
/**
 * Split a query string into its parameters (the reverse of http_build_query).
 *
 * Names and values are returned as they appear in the query, without URL
 * decoding.
 *
 * @param string $query    Full URL, or a bare query string when $is_query is true
 * @param bool   $is_query Whether $query is already a bare query string
 * @return array Map of parameter name to value
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-05-16 17:29
 */
public static function http_split_query($query, $is_query = false)
{
    if (!$is_query)
    {
        $parse_arr = parse_url($query);
        if (empty($parse_arr['query']))
        {
            return array();
        }
        $query = $parse_arr['query'];
    }
    $params = array();
    foreach (explode("&", $query) as $val)
    {
        // Skip empty segments such as a trailing "&"
        if ($val === '')
        {
            continue;
        }
        // Split on the first "=" only so values may contain "=" themselves;
        // a parameter without "=" gets an empty value
        $arr = explode("=", $val, 2);
        $params[$arr[0]] = isset($arr[1]) ? $arr[1] : '';
    }
    return $params;
}
}

View File

@ -0,0 +1,421 @@
<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// Worker多进程操作类
//----------------------------------
/**
 * Multi-process worker manager built on the pcntl and posix extensions.
 *
 * run() forks $count worker processes from the master process, invokes the
 * configured callbacks in the master and in every worker, and then blocks in
 * the master until the workers have exited.
 */
class worker
{
    // Number of worker processes to fork
    public $count = 0;
    // Worker id; workers are numbered from 1, 0 is used by the master process
    public $worker_id = 0;
    // Process id of the current process
    public $worker_pid = 0;
    // User the worker processes should run as
    public $user = '';
    // Process title
    public $title = '';
    // Whether each worker runs only once (an exited worker is not forked again)
    public $run_once = true;
    // Whether log output is shown
    public $log_show = false;
    // Callback invoked in the master process on start
    public $on_start = false;
    // Callback invoked in the master process on stop
    public $on_stop = false;
    // Callback invoked in a worker process on start
    public $on_worker_start = false;
    // Callback invoked in a worker process on stop
    public $on_worker_stop = false;
    // Process id of the master process
    protected static $_master_pid = 0;
    // Map of worker id => process id (0 once that worker has exited)
    protected static $_worker_pids = array();
    // Start time of the master / worker process
    public $time_start = 0;
    // Run state of the master / worker process [starting|running|shutdown|reload]
    protected static $_status = "starting";

    /**
     * Remember the master pid and install the signal handlers.
     */
    public function __construct()
    {
        self::$_master_pid = posix_getpid();
        // Enable ticks; per the original author the parent process only
        // receives signals once this is in place
        declare(ticks = 1);
        $this->install_signal();
    }

    /**
     * Install the signal handlers.
     * @return void
     */
    protected function install_signal()
    {
        // stop
        pcntl_signal(SIGINT, array($this, 'signal_handler'), false);
        // reload
        pcntl_signal(SIGUSR1, array($this, 'signal_handler'), false);
        // status
        pcntl_signal(SIGUSR2, array($this, 'signal_handler'), false);
        // ignore
        pcntl_signal(SIGPIPE, SIG_IGN, false);
        // install signal handler for dead kids
        // pcntl_signal(SIGCHLD, array($this, 'signal_handler'));
    }

    /**
     * Uninstall the signal handlers (the signals are ignored afterwards).
     * @return void
     */
    protected function uninstall_signal()
    {
        // uninstall stop signal handler
        pcntl_signal(SIGINT, SIG_IGN, false);
        // uninstall reload signal handler
        pcntl_signal(SIGUSR1, SIG_IGN, false);
        // uninstall status signal handler
        pcntl_signal(SIGUSR2, SIG_IGN, false);
    }

    /**
     * Signal handler; public because it is registered as a callback with
     * pcntl_signal().
     * @param int $signal
     */
    public function signal_handler($signal)
    {
        switch ($signal)
        {
            // stop
            case SIGINT:
                // Runs in the master process as well as in the workers
                $this->stop_all();
                break;
            // reload
            case SIGUSR1:
                echo "reload\n";
                break;
            // show status
            case SIGUSR2:
                echo "status\n";
                break;
        }
    }

    /**
     * Start the master: run the start callback, fork the workers and
     * monitor them.
     */
    public function run()
    {
        $this->time_start = microtime(true);
        $this->worker_id = 0;
        $this->worker_pid = posix_getpid();
        $this->set_process_title($this->title);
        // Set before forking so that the workers inherit the value
        if ($this->log_show)
        {
            log::$log_show = true;
        }
        if ($this->on_start)
        {
            call_user_func($this->on_start, $this);
        }
        // Worker ids start at 1; 0 is used by the master process
        for ($i = 1; $i <= $this->count; $i++)
        {
            $this->fork_one_worker($i);
        }
        $this->monitor_workers();
    }

    /**
     * Fork one worker process.
     *
     * In the master the child pid is recorded. In the child the worker start
     * callback runs, then stop(), and the process exits with status 0.
     *
     * @param int $worker_id Id assigned to the new worker
     */
    public function fork_one_worker($worker_id)
    {
        //$sockets = stream_socket_pair(STREAM_PF_UNIX, STREAM_SOCK_STREAM, STREAM_IPPROTO_IP);
        $pid = pcntl_fork();
        // Master process: record the child pid
        if ($pid > 0)
        {
            self::$_worker_pids[$worker_id] = $pid;
        }
        // Child process
        elseif (0 === $pid)
        {
            $this->time_start = microtime(true);
            $this->worker_id = $worker_id;
            $this->worker_pid = posix_getpid();
            $this->set_process_title($this->title);
            $this->set_process_user($this->user);
            // Drop the worker pid table copied from the master
            self::$_worker_pids = array();
            //$this->uninstall_signal();
            // Mark the worker as running
            self::$_status = "running";
            // Registered inside the child: reports unexpected exits
            register_shutdown_function(array($this, 'check_errors'));
            // Run the worker start callback when one is configured
            if ($this->on_worker_start)
            {
                call_user_func($this->on_worker_start, $this);
            }
            // Stop this worker instance
            $this->stop();
            // 0 signals a normal exit
            exit(0);
        }
        else
        {
            log::add("fork one worker fail", "Error");
            // A failed fork is an error, so do not report success
            exit(1);
        }
    }

    /**
     * Try to switch the current process to the given user.
     *
     * Does nothing when no user name is given or when the process is not
     * running as root.
     *
     * @param string $user_name
     */
    protected static function set_process_user($user_name)
    {
        if (empty($user_name) || posix_getuid() !== 0)
        {
            return;
        }
        $user_info = posix_getpwnam($user_name);
        // posix_getpwnam() returns false for an unknown user
        if ($user_info === false)
        {
            log::add('Can not run woker as '.$user_name." , You shuld be root", "Error");
            return;
        }
        if ($user_info['uid'] != posix_getuid() || $user_info['gid'] != posix_getgid())
        {
            if (!posix_setgid($user_info['gid']) || !posix_setuid($user_info['uid']))
            {
                log::add('Can not run woker as '.$user_name." , You shuld be root", "Error");
            }
        }
    }

    /**
     * Set the process title shown by tools such as "ps aux".
     * Needs PHP >= 5.5 or the proctitle extension.
     * @param string $title
     * @return void
     */
    protected function set_process_title($title)
    {
        if (!empty($title))
        {
            // proctitle extension
            if (extension_loaded('proctitle') && function_exists('setproctitle'))
            {
                @setproctitle($title);
            }
            // PHP >= 5.5
            elseif (function_exists('cli_set_process_title'))
            {
                cli_set_process_title($title);
            }
        }
    }

    /**
     * Wait for worker processes to exit and react to their exit status.
     *
     * Runs in the master. Once every worker has exited (or the wait is
     * interrupted) the stop callback is invoked and the process exits.
     *
     * @return void
     */
    public function monitor_workers()
    {
        // Mark the master as running
        self::$_status = "running";
        while (1)
        {
            //pcntl_signal_dispatch();
            // Block until a child exits or a signal interrupts the wait
            $status = 0;
            $pid = pcntl_wait($status, WUNTRACED);
            //pcntl_signal_dispatch();
            // A child exited
            if ($pid > 0)
            {
                // Not a clean exit (for example killed)
                if ($status !== 0)
                {
                    log::add("worker {$pid} exit with status $status", "Warning");
                }
                // Look up the worker id that belongs to this pid
                $worker_id = array_search($pid, self::$_worker_pids, true);
                if ($worker_id === false)
                {
                    // Not one of the workers recorded here; nothing to update
                    continue;
                }
                // Keep the slot (set to 0 rather than unset) so that the
                // worker can be forked again under the same id
                self::$_worker_pids[$worker_id] = 0;
                // Fork a replacement worker
                if (!$this->run_once)
                {
                    $this->fork_one_worker($worker_id);
                }
                // When every worker has exited, finish the master as well
                $all_worker_stop = true;
                foreach (self::$_worker_pids as $_worker_pid)
                {
                    // One live worker is enough to keep going
                    if ($_worker_pid != 0)
                    {
                        $all_worker_stop = false;
                        break;
                    }
                }
                if ($all_worker_stop)
                {
                    if ($this->on_stop)
                    {
                        call_user_func($this->on_stop, $this);
                    }
                    exit(0);
                }
            }
            // Anything else, e.g. the wait was interrupted by a signal
            else
            {
                if ($this->on_stop)
                {
                    call_user_func($this->on_stop, $this);
                }
                exit(0);
            }
        }
    }

    /**
     * Shut everything down. Triggered by a signal, not by normal completion.
     *
     * The master forwards SIGINT to its workers; a worker runs stop() and exits.
     *
     * @return void
     */
    public function stop_all()
    {
        // Mark the process as shutting down
        self::$_status = "shutdown";
        // Master process
        if (self::$_master_pid === posix_getpid())
        {
            foreach (self::$_worker_pids as $worker_pid)
            {
                // Exited workers are recorded as 0, and kill with pid 0
                // signals the whole process group (including the master),
                // so only signal real pids
                if ($worker_pid > 0)
                {
                    posix_kill($worker_pid, SIGINT);
                }
            }
        }
        // Worker process
        else
        {
            // NOTE(review): the original author remarks that business
            // integrity should be considered here rather than exiting abruptly
            $this->stop();
            exit(0);
        }
    }

    /**
     * Stop the current worker instance.
     * Called on normal completion as well as on a stop signal.
     * @return void
     */
    public function stop()
    {
        if ($this->on_worker_stop)
        {
            call_user_func($this->on_worker_stop, $this);
        }
        // Mark the worker as shut down
        self::$_status = "shutdown";
    }

    /**
     * Shutdown function: log when the process exits without having been
     * stopped through stop().
     * @return void
     */
    public function check_errors()
    {
        // The worker did not go through a regular shutdown
        if (self::$_status != "shutdown")
        {
            $error_msg = "WORKER EXIT UNEXPECTED ";
            $errors = error_get_last();
            if ($errors && ($errors['type'] === E_ERROR ||
                $errors['type'] === E_PARSE ||
                $errors['type'] === E_CORE_ERROR ||
                $errors['type'] === E_COMPILE_ERROR ||
                $errors['type'] === E_RECOVERABLE_ERROR))
            {
                $error_msg .= $this->get_error_type($errors['type']) . " {$errors['message']} in {$errors['file']} on line {$errors['line']}";
            }
            log::add($error_msg, 'Error');
        }
    }

    /**
     * Map an error type constant to its name.
     * @param integer $type
     * @return string Constant name, or an empty string for unknown types
     */
    protected function get_error_type($type)
    {
        switch ($type)
        {
            case E_ERROR: // 1
                return 'E_ERROR';
            case E_WARNING: // 2
                return 'E_WARNING';
            case E_PARSE: // 4
                return 'E_PARSE';
            case E_NOTICE: // 8
                return 'E_NOTICE';
            case E_CORE_ERROR: // 16
                return 'E_CORE_ERROR';
            case E_CORE_WARNING: // 32
                return 'E_CORE_WARNING';
            case E_COMPILE_ERROR: // 64
                return 'E_COMPILE_ERROR';
            case E_COMPILE_WARNING: // 128
                return 'E_COMPILE_WARNING';
            case E_USER_ERROR: // 256
                return 'E_USER_ERROR';
            case E_USER_WARNING: // 512
                return 'E_USER_WARNING';
            case E_USER_NOTICE: // 1024
                return 'E_USER_NOTICE';
            case 2048: // E_STRICT; literal value used because the constant is deprecated in recent PHP
                return 'E_STRICT';
            case E_RECOVERABLE_ERROR: // 4096
                return 'E_RECOVERABLE_ERROR';
            case E_DEPRECATED: // 8192
                return 'E_DEPRECATED';
            case E_USER_DEPRECATED: // 16384
                return 'E_USER_DEPRECATED';
        }
        return "";
    }
}

20
vendor/owner888/phpspider/gitadd.sh vendored Normal file
View File

@ -0,0 +1,20 @@
#!/bin/bash
# Usage: gitadd.sh <file-or-directory> [commit message]
# Pulls the latest changes, stages the given path, commits it with the given
# message (default "add file") and pushes to origin/master.
if [ ! -d "$1" ] && [ ! -f "$1" ]; then
    echo "file $1 not exists"
    # Non-zero status so that callers can detect the failure
    exit 1
fi
filename=$1
comment="add file"
if [[ $2 != "" ]]; then
    comment=$2
fi
echo "start update..."
git pull
echo "start add new file..."
# Quote the path so that names with spaces or glob characters stay one
# argument; "--" keeps a leading "-" from being read as an option
git add -- "$filename"
echo "start commit..."
git commit -m "$comment" -- "$filename"
git push -u origin master
echo "git commit complete..."

View File

@ -0,0 +1,129 @@
_-o#&&*''''?d:>b\_
_o/"`'' '',, dMF9MMMMMHo_
.o&#' `"MbHMMMMMMMMMMMHo.
.o"" ' vodM*$&&HMMMMMMMMMM?.
,' $M&ood,~'`(&##MMMMMMH\
/ ,MMMMMMM#b?#bobMMMMHMMML
& ?MMMMMMMMMMMMMMMMM7MMM$R*Hk
?$. :MMMMMMMMMMMMMMMMMMM/HMMM|`*L
| |MMMMMMMMMMMMMMMMMMMMbMH' T,
$H#: `*MMMMMMMMMMMMMMMMMMMMb#]' `?
]MMH# ""*""""*#MMMMMMMMMMMMM' -
MMMMMb_ |MMMMMMMMMMMP' :
HMMMMMMMHo `MMMMMMMMMT .
?MMMMMMMMP 9MMMMMMMM] -
-?MMMMMMM |MMMMMMMMM?,d- ' {Name}
:|MMMMMM- `MMMMMMMT .M|. : {Description}
.9MMM[ &MMMMM*' `' . {Loaded}
:9MMk `MMM#" -
&M] ` .-
`&. .
`~, . ./
. _ .-
'`--._,dd###pp=""'
$$$$$AnyShIt$$$$$$
_v->#H#P? "':o<>\_
.,dP` `'' "'-o.+H6&MMMHo_
oHMH9' `?&bHMHMMMMMMHo.
oMP"' ' ooMP*#&HMMMMMMM?.
,M* - `*MSdob//`^&##MMMH\
d*' .,MMMMMMH#o>#ooMMMMMb
HM- :HMMMMMMMMMMMMMMM&HM[R\
d"Z\. 9MMMMMMMMMMMMMMMMM[HMM|:
-H - MMMMMMMMMMMMMMMMMMMbMP' :
:??Mb# `9MMMMMMMMMMMMMMMMMMH#! .
: MMMMH#, "*""""`#HMMMMMMMMMMH -
||MMMMMM6\. [MMMMMMMMMH' :
:|MMMMMMMMMMHo `9MMMMMMMM' .
. HMMMMMMMMMMP' !MMMMMMMM `
- `#MMMMMMMMM HMMMMMMM*,/ :
: ?MMMMMMMF HMMMMMM',P' : {Name}
. HMMMMR' [MMMMP' ^' - {Description}
: `HMMMT iMMH' .' {Loaded}
-.`HMH .
-:*H . '
-`\,, . .-
' . _ .-`
'`~\.__,obb#q==~'''
$$$$$AnyShIt$$$$$$
_ood>H&H&Z?#M#b-\.
.\HMMMMMR?`\M6b."`' ''``v.
.. .MMMMMMMMMMHMMM#&. ``~o.
. ,HMMMMMMMMMMMM*"'-` &b.
. .MMMMMMMMMMMMH' `"&\
- RMMMMM#H##R' 4Mb
- |7MMM' ?:: `|MMb
/ HMM__#|`"\>?v.. `MMML
. `"'#Hd| ` 9MMM:
- |\,\?HH#bbL `9MMb
: !MMMMMMMH#b, `""T
. . ,MMMMMMMMMMMbo. |
: 4MMMMMMMMMMMMMMMHo |
: ?MMMMMMMMMMMMMMM? :
-. `#MMMMMMMMMMMM: .-
: |MMMMMMMMMM? .
- JMMMMMMMT' : {Name}
`. MMMMMMH' - {Description}
-. |MMM#*` - {Loaded}
. HMH' . '
-. #H:. .-
` . .\ .-
'-..-+oodHL_,--/-`
$$$$$AnyShIt$$$$$$
.,:,#&6dHHHb&##o\_
.oHHMMMMMMMMMMMMMMMMMH*\,.
oHMMMMMMMMMMMMMMMMMMMMMMHb:'-.
.dMMMMMMMMMMMMMMMMMMMMMMMMMH|\/' .
,&HMMMMMMMMMMMMMMMMMMMMMMM/"&.,d. -.
dboMMHMMMMMMMMMMMMMMMMMMMMMML `' .
HMHMMM$Z***MMMMMMMMMMMMMMMMMM|.- .
dMM]MMMM#' `9MMMH?"`MMMMR'T' _ :
|MMMbM#'' |MM" ``MMMH. <_ .
dMMMM#& *&. .?`*" .'&: .
MMMMMH- `' -v/H .dD "' ' :
MMMM* `*M: 4MM*::-!v,_ :
MMMM `*?::" "'``"?9Mb::. :
&MMM, `"'"'|"._ "?`| - :
`MMM].H ,#dM[_H ..:
9MMi`M: . .ooHMMMMMMM, ..
9Mb `- 1MMMMMMMMMM| : {Name}
?M |MM#*#MMMM* . {Description}
-. ` |#"' ,' {Loaded}
. -" v`
-. .-
- . . `
'-*#d#HHMMMMHH#"-'
$$$$$AnyShIt$$$$$$
.-:?,Z?:&$dHH##b\_
,:bqRMMMMMMMMMMMMMMMMMHo.
.?HHHMMMMMMMMMMMMMMMMMMMMMMMHo.
-o/*M9MMMMMMMMMMMMMMMMMMMMMMMMMMMv
.:H\b\'|?#HHMMMMMMMMMMMMMMMMMMMMMM6?Z\
.?MMMHbdbbodMMMMHMMMMMMMMMMMMMMMMMMMM\':
:MMMMMMMMMMM7MMMMb?6P**#MMMMMMMMMMMMMMM_ :
\MMMMMMMMMMMMb^MMMMMM? `*MMMM*"`MMMR<' . -
.1MMMMMMMMMMMMMb]M#"" 9MR' `?MMb \. :
-MMMMMMMMMMMMMMMH##|` *&. |`*' .\ .
-?""*MMMMMMMMMMMMM' ' |?b ,]" :
: MMMMMMMMMMH' `M_|M]r\?
. `MMMMMMMMM' `$_:`'"H
- TMMMMMMMM, '"``::
: [MMMMMMMM| oH| .#M-
: `9MMMMMM' .MP . ,oMMT
. HMMMMP' `' ,MMMP {Name}
- `MMH' HH9* {Description}
'. ` ` .' {Loaded}
- . '
` . - .-
` . .-
' -==pHMMH##HH#"""

View File

@ -0,0 +1,49 @@
<?php
// Raise the memory limit for this script
ini_set("memory_limit", "10240M");
// Load the autoloader from the parent directory
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\requests;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* 不要删除这段注释 */
// Print the banner, then end the script: every statement after this exit is
// never executed (the class declaration below is still available to the
// call above).
hacked_emails::random_banner();
exit;
/**
 * Prints a coloured ASCII-art banner taken from banners.txt.
 */
class hacked_emails
{
    // Colors
    // green - yellow - blue - red - white - magenta - cyan - reset
    public static $color_g = "\033[92m";
    public static $color_y = "\033[93m";
    public static $color_b = "\033[94m";
    public static $color_r = "\033[91m";
    public static $color_w = "\033[0m";
    public static $color_m = "\x1b[35m";
    public static $color_c = "\x1b[36m";
    public static $end = "\x1b[39m";
    public static $bold = "\033[1m";

    /**
     * Print one randomly chosen banner with its placeholders filled in.
     *
     * banners.txt holds several banners separated by the marker line
     * "$$$$$AnyShIt$$$$$$"; the {Name}, {Description} and {Loaded}
     * placeholders are replaced with coloured text.
     *
     * @return void
     */
    public static function random_banner()
    {
        // Prefer the file next to this script so the result does not depend
        // on the current working directory; fall back to the relative name
        $path = __DIR__ . '/banners.txt';
        if (!is_file($path))
        {
            $path = 'banners.txt';
        }
        $content = is_readable($path) ? file_get_contents($path) : false;
        if ($content === false)
        {
            $content = '';
        }
        $banners = explode('$$$$$AnyShIt$$$$$$', $content);
        // Pick a random banner (the last one used to be printed every time)
        $banner = $banners[array_rand($banners)];
        $banner_to_print = self::$color_g;
        $banner_to_print .= $banner;
        $banner_to_print .= self::$end;
        $name = self::$color_b."Hacked Emails By ".self::$bold."@seatle -".self::$color_m." V0.1".self::$color_g;
        $banner_to_print = str_replace("{Name}", $name, $banner_to_print);
        $description = self::$color_c."Know the dangers of email credentials reuse attacks.".self::$color_g;
        $banner_to_print = str_replace("{Description}", $description, $banner_to_print);
        $loaded = self::$color_b."Loaded ".self::$color_y."14".self::$color_b." website.".self::$color_g;
        $banner_to_print = str_replace("{Loaded}", $loaded, $banner_to_print);
        echo $banner_to_print;
    }
}
// NOTE(review): the unconditional "exit;" near the top of this script runs
// before these statements, so they are never executed as the file stands.
// Fetch one article page and print what the CSS selector "div.author" matches.
$html = requests::get('http://www.qiushibaike.com/article/118914171');
//echo $html;
//exit;
$data = selector::select($html, "div.author", "css");
echo $data;

View File

@ -0,0 +1,425 @@
<?php
/**
* cURL request helper class (cls_curl) and curl_multi helper functions
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.epooll.com/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
/**
 * Static wrapper around a single cURL handle for GET and POST requests.
 *
 * Options are configured through the set_*() methods and applied on every
 * request. The handle is created on demand and closed after each request.
 */
class cls_curl
{
    protected static $timeout = 10;
    protected static $ch = null;
    protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
    protected static $http_raw = false;
    protected static $cookie = null;
    protected static $cookie_jar = null;
    protected static $cookie_file = null;
    protected static $referer = null;
    protected static $ip = null;
    protected static $proxy = null;
    protected static $headers = array();
    protected static $hosts = array();
    protected static $gzip = false;
    protected static $info = array();

    /**
     * Set the timeout.
     *
     * @param int $timeout Connect timeout in seconds (the transfer timeout is 5 seconds longer)
     * @return void
     */
    public static function set_timeout($timeout)
    {
        self::$timeout = $timeout;
    }

    /**
     * Set the proxy.
     *
     * @param mixed $proxy
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-09-18 10:17
     */
    public static function set_proxy($proxy)
    {
        self::$proxy = $proxy;
    }

    /**
     * Set the Referer header.
     *
     * @param string $referer
     * @return void
     */
    public static function set_referer($referer)
    {
        self::$referer = $referer;
    }

    /**
     * Set the User-Agent.
     *
     * @param string $useragent
     * @return void
     */
    public static function set_useragent($useragent)
    {
        self::$useragent = $useragent;
    }

    /**
     * Set the Cookie header value.
     *
     * @param string $cookie
     * @return void
     */
    public static function set_cookie($cookie)
    {
        self::$cookie = $cookie;
    }

    /**
     * Set the cookie jar (passed to CURLOPT_COOKIEJAR).
     *
     * @param string $cookie_jar
     * @return void
     */
    public static function set_cookie_jar($cookie_jar)
    {
        self::$cookie_jar = $cookie_jar;
    }

    /**
     * Set the cookie file (passed to CURLOPT_COOKIEFILE).
     *
     * @param string $cookie_file
     * @return void
     */
    public static function set_cookie_file($cookie_file)
    {
        self::$cookie_file = $cookie_file;
    }

    /**
     * Choose whether the response headers are returned together with the body.
     *
     * @param mixed $http_raw
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-09-18 10:17
     */
    public static function set_http_raw($http_raw)
    {
        self::$http_raw = $http_raw;
    }

    /**
     * Set the IP sent in the CLIENT-IP and X-FORWARDED-FOR headers.
     *
     * @param string $ip
     * @return void
     */
    public static function set_ip($ip)
    {
        self::$ip = $ip;
    }

    /**
     * Set additional request headers.
     *
     * @param array $headers List of "Name: value" header lines
     * @return void
     */
    public static function set_headers($headers)
    {
        self::$headers = $headers;
    }

    /**
     * Set the list of IPs to pick from at random in place of the URL host.
     *
     * @param array $hosts
     * @return void
     */
    public static function set_hosts($hosts)
    {
        self::$hosts = $hosts;
    }

    /**
     * Enable or disable gzip encoding.
     *
     * @param bool $gzip
     * @return void
     */
    public static function set_gzip($gzip)
    {
        self::$gzip = $gzip;
    }

    /**
     * Create the cURL handle when there is none yet.
     *
     * @return resource|object The cURL handle
     */
    public static function init()
    {
        // http_request() resets the handle to null after closing it, so a
        // null check works for resource handles and object handles alike
        if (self::$ch === null)
        {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout);
            curl_setopt(self::$ch, CURLOPT_HEADER, false);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
            // Per the original author: with timeouts in a multi-threaded
            // setting this skips the signal handlers, although crashes can
            // still occur with small probability
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
        }
        return self::$ch;
    }

    /**
     * Send a GET request.
     *
     * @param string $url
     * @param array  $fields Query parameters appended to the URL
     * @return string|bool Response body, or false on failure
     */
    public static function get($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'get', $fields);
    }

    /**
     * Send a POST request.
     *
     * $fields may be 1) an array, 2) an http_build_query() string or 3) a
     * JSON string:
     * array('name'=>'yangzetao'), http_build_query(array('name'=>'yangzetao')),
     * json_encode(array('name'=>'yangzetao')).
     * The first two are regular form posts; the third is a raw body that the
     * receiving side has to read from the request stream.
     *
     * @param mixed $url
     * @param array|string $fields
     * @static
     * @access public
     * @return string|bool Response body, or false on failure
     */
    public static function post($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'post', $fields);
    }

    /**
     * Perform the request with the configured options and close the handle.
     *
     * @param string       $url
     * @param string       $type   'get' or 'post' (case-insensitive)
     * @param array|string $fields Query parameters (get) or body (post)
     * @return string|bool Response body, or false on failure
     */
    public static function http_request($url, $type = 'get', $fields = array())
    {
        // Make sure a handle exists when this method is called directly
        self::init();
        $method = strtolower($type);
        // Work on a copy so that per-request headers (Host, CLIENT-IP, ...)
        // do not pile up in the static list from one request to the next
        $headers = self::$headers;
        // GET: append the parameters to the URL
        if ($method == 'get' && !empty($fields))
        {
            $url = $url . (strpos($url, "?") === false ? "?" : "&") . http_build_query($fields);
        }
        // Bind a random entry of $hosts as a simple form of load balancing
        if (self::$hosts)
        {
            $parse_url = parse_url($url);
            if (!empty($parse_url['host']))
            {
                $host = $parse_url['host'];
                $candidates = array_values(self::$hosts);
                $ip = $candidates[rand(0, count($candidates) - 1)];
                // Replace the first occurrence only; the same text may
                // appear again in the path or query
                $pos = strpos($url, $host);
                if ($pos !== false)
                {
                    $url = substr_replace($url, $ip, $pos, strlen($host));
                }
                $headers = array_merge(array('Host:' . $host), $headers);
            }
        }
        curl_setopt(self::$ch, CURLOPT_URL, $url);
        // POST
        if ($method == 'post')
        {
            curl_setopt(self::$ch, CURLOPT_POST, true);
            curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields);
        }
        if (self::$useragent)
        {
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
        }
        if (self::$cookie)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIE, self::$cookie);
        }
        if (self::$cookie_jar)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar);
        }
        if (self::$cookie_file)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file);
        }
        if (self::$referer)
        {
            curl_setopt(self::$ch, CURLOPT_REFERER, self::$referer);
        }
        if (self::$ip)
        {
            $headers = array_merge(array('CLIENT-IP:' . self::$ip, 'X-FORWARDED-FOR:' . self::$ip), $headers);
        }
        if ($headers)
        {
            curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $headers);
        }
        if (self::$gzip)
        {
            curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip');
        }
        if (self::$proxy)
        {
            curl_setopt(self::$ch, CURLOPT_PROXY, self::$proxy);
        }
        if (self::$http_raw)
        {
            curl_setopt(self::$ch, CURLOPT_HEADER, true);
        }
        $data = curl_exec(self::$ch);
        self::$info = curl_getinfo(self::$ch);
        // Close the handle and forget it so that init() creates a new one
        curl_close(self::$ch);
        self::$ch = null;
        return $data;
    }

    /**
     * Transfer information (curl_getinfo) of the last request.
     *
     * @return array Empty before the first request
     */
    public static function get_info()
    {
        return self::$info;
    }

    /**
     * HTTP status code of the last request.
     *
     * @return int 0 when no request has been made yet
     */
    public static function get_http_code()
    {
        return isset(self::$info['http_code']) ? self::$info['http_code'] : 0;
    }
}
/**
 * Fetch several URLs in parallel with curl_multi and pass every response
 * to callback().
 *
 * All transfers run to completion first; afterwards callback($content,
 * $delay, $url) is invoked once per URL.
 *
 * @param array $urls  URLs to fetch; duplicates are fetched once
 * @param mixed $delay Passed through to callback() unchanged
 * @return array Map of URL to the value returned by callback()
 */
function classic_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();
    // The map is keyed by URL, so a repeated URL would overwrite its entry
    // and leave the earlier handle in the queue without ever being closed
    foreach (array_unique($urls) as $url)
    {
        // Create the cURL handle
        $ch = curl_init();
        // Set the URL and the other options
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        // Add the handle to the multi queue
        curl_multi_add_handle($queue, $ch);
        $map[$url] = $ch;
    }
    $active = null;
    // Start the transfers
    do {
        $mrc = curl_multi_exec($queue, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    while ($active > 0 && $mrc == CURLM_OK)
    {
        // curl_multi_select() can return -1 immediately; without a short
        // pause the loop then spins at 100% CPU
        if (curl_multi_select($queue, 0.5) == -1)
        {
            usleep(10000);
        }
        do {
            $mrc = curl_multi_exec($queue, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }
    $responses = array();
    foreach ($map as $url => $ch)
    {
        $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
        curl_multi_remove_handle($queue, $ch);
        curl_close($ch);
    }
    curl_multi_close($queue);
    return $responses;
}
/**
 * Fetch a batch of URLs in parallel with curl_multi, handing each body to
 * callback() as soon as its own transfer completes.
 *
 * The URL of every transfer is stored on the easy handle itself through
 * CURLOPT_PRIVATE. The previous implementation keyed a lookup table by
 * "(string) $ch", which throws on PHP 8 where cURL handles are objects that
 * cannot be converted to string.
 *
 * NOTE(review): the Cookie header below is a hard-coded session captured
 * from a browser; it is kept unchanged so behaviour stays the same, but it
 * should really be supplied by the caller.
 *
 * @param array $urls  URLs to download
 * @param mixed $delay Passed through to callback() unchanged
 * @return array Map of URL => array('info' => ..., 'error' => ..., 'results' => ...)
 */
function rolling_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
    $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';

    foreach ($urls as $url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
        // Remember which URL this handle belongs to
        curl_setopt($ch, CURLOPT_PRIVATE, (string) $url);
        curl_multi_add_handle($queue, $ch);
    }

    $responses = array();
    $active = 0;
    do {
        while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;
        if ($code != CURLM_OK) { break; }

        // One or more transfers may have just completed -- collect them
        while ($done = curl_multi_info_read($queue)) {
            $handle = $done['handle'];
            $url = curl_getinfo($handle, CURLINFO_PRIVATE);
            $info = curl_getinfo($handle);
            $error = curl_error($handle);
            $results = callback(curl_multi_getcontent($handle), $delay, $url);
            $responses[$url] = compact('info', 'error', 'results');
            // Release the finished handle
            curl_multi_remove_handle($queue, $handle);
            curl_close($handle);
        }

        // Block until there is socket activity; errors surface via curl_multi_exec
        if ($active > 0) {
            curl_multi_select($queue, 0.5);
        }
    } while ($active);

    curl_multi_close($queue);
    return $responses;
}
/**
 * Persist one downloaded page below ./html2/.
 *
 * The file name is the MD5 hash of the URL with an ".html" suffix. Empty
 * bodies are skipped. $delay is accepted but not used.
 *
 * @param string $data  Response body
 * @param mixed  $delay Unused
 * @param string $url   URL the body was fetched from
 * @return void
 */
function callback($data, $delay, $url)
{
    // Nothing to store for an empty body
    if (empty($data))
    {
        return;
    }
    $target = "./html2/" . md5($url) . ".html";
    file_put_contents($target, $data);
}

View File

@ -0,0 +1,248 @@
<?php
/**
 * Minimal regex-based HTML selector engine.
 *
 * Supports space-separated descendant selectors whose parts look like
 * "tag", "tag#id", "tag.class", "#id", ".class", each optionally followed by
 * one attribute filter such as [href='/x'].
 *
 * Usage:
 *   cls_query::init($html);
 *   $links = cls_query::query("ul#nav li a", "href");
 */
class cls_query
{
    /** @var string HTML that queries run against; set through init() */
    private static $content;

    /** @var bool When true, log() echoes every generated regular expression */
    public static $debug = false;

    /**
     * Set the HTML document used by subsequent queries.
     *
     * @param string $content
     * @return void
     */
    public static function init($content)
    {
        self::$content = $content;
    }

    /**
     * Run a selector against the current document.
     *
     * @param string $query Selector, e.g. "div#main a.link"
     * @param string $attr  "html" for the inner HTML of the matched nodes,
     *                      otherwise the name of the attribute to extract
     * @return array|false|null List of matches; false when no document is
     *                          loaded; null when the selector yields no node list
     */
    public static function query($query, $attr = "html")
    {
        $nodes = self::get_nodes($query);
        $datas = self::get_datas($nodes, $attr);
        return $datas;
    }

    /**
     * Test whether $char contains a word character.
     *
     * @param string $char
     * @return mixed Result of mb_eregi() or preg_match()
     */
    protected static function is_char($char) {
        return extension_loaded('mbstring') ? mb_eregi('\w', $char) : preg_match('@\w@', $char);
    }

    /**
     * Split a selector into a list of node descriptors.
     *
     * Each descriptor has the keys "path" (selector text up to and including
     * this part), "name" (tag name), "id", "class" and "other" (empty array,
     * or array('key' => ..., 'val' => ...) for an attribute filter).
     *
     * @param string $query
     * @return array
     */
    private static function get_nodes($query)
    {
        // Collapse runs of whitespace into one space and strip the spaces
        // around the >, + and ~ combinators (e.g. "ul > li.className").
        $query = trim(
            preg_replace('@\s+@', ' ',
                preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)
            )
        );
        $nodes = array();
        if (! $query)
        {
            return $nodes;
        }
        $path = '';
        $query_arr = explode(" ", $query);
        foreach ($query_arr as $k=>$v)
        {
            $path = $k == 0 ? $v : $path.' '.$v;
            $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array());
            // Attribute filter present, e.g. a[rel='topic']
            if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3]))
            {
                // Strip the filter from the selector part and remember it
                $v = $matches[1];
                $node['other'] = array(
                    'key'=>$matches[2],
                    'val'=>$matches[3],
                );
            }
            $id_arr = explode("#", $v);
            $class_arr = explode(".", $v);
            // tag#id
            if (count($id_arr) === 2)
            {
                $node['name'] = $id_arr[0];
                $node['id'] = $id_arr[1];
            }
            // tag.class
            elseif (count($class_arr) === 2)
            {
                $node['name'] = $class_arr[0];
                $node['class'] = $class_arr[1];
            }
            // plain tag
            else
            {
                $node['name'] = $v;
            }
            $nodes[] = $node;
        }
        return $nodes;
    }

    /**
     * Resolve a list of node descriptors against the current document.
     *
     * The first node is matched against the whole document; every following
     * node is matched inside each result of the previous node.
     *
     * @param array  $nodes Descriptors as produced by get_nodes()
     * @param string $attr  "html" or an attribute name (applied to the last node only)
     * @return array|false|null
     */
    public static function get_datas($nodes, $attr = "html")
    {
        if (empty(self::$content))
        {
            return false;
        }
        $node_datas = array();
        $count = count($nodes);
        foreach ($nodes as $i=>$node)
        {
            $is_last = ($count == $i+1);
            if ($i == 0)
            {
                $datas = self::get_node_datas($node, self::$content, $attr, $is_last);
                // Nothing matched at the first level: no point in going deeper
                if(!$datas)
                {
                    break;
                }
                $node_datas[$nodes[$i]['path']] = $datas;
            }
            else
            {
                $datas = array();
                // Search inside every fragment matched by the previous node
                foreach ($node_datas[$nodes[$i-1]['path']] as $v)
                {
                    $datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) );
                }
                $node_datas[$nodes[$i]['path']] = $datas;
                // The previous level is no longer needed; free it to bound memory use
                unset($node_datas[$nodes[$i-1]['path']]);
            }
        }
        // Only the deepest level is left; return it (null when nothing was stored)
        $node_datas = array_pop($node_datas);
        return $node_datas;
    }

    /**
     * Extract the matches of one node descriptor from an HTML fragment.
     *
     * Every piece of selector text is passed through preg_quote() before it
     * is embedded in a pattern, and a tag name must be followed by
     * whitespace, "/" or ">" so that "a" no longer matches "<abbr>".
     *
     * @param array  $node    Descriptor from get_nodes()
     * @param string $content HTML fragment to search
     * @param string $attr    "html" or an attribute name
     * @param bool   $is_last Whether $node is the last part of the selector
     * @return array
     */
    private static function get_node_datas($node, $content, $attr = "html", $is_last = false)
    {
        $node_datas = array();
        $name = preg_quote($node['name'], '@');
        // Tag name plus a boundary assertion (only when a tag name is given)
        $tag = $name === '' ? '' : $name.'(?=[\\s/>])';
        if (!empty($node['id']))
        {
            $id = preg_quote($node['id'], '@');
            if ($node['name'])
                $regex = '@<'.$tag.'[^>]+id\\s*=\\s*["|\']+?'.$id.'\\s*[^>]+?>(.*?)</'.$name.'>@is';
            else
                $regex = '@id\\s*=\\s*["|\']+?'.$id.'\\s*[^>]+?>(.*?)<@is';
        }
        elseif (!empty($node['class']))
        {
            $class = preg_quote($node['class'], '@');
            if ($node['name'])
                $regex = '@<'.$tag.'[^>]+class\\s*=\\s*["|\']+?'.$class.'\\s*[^>]+?>(.*?)</'.$name.'>@is';
            else
                $regex = '@class\\s*=\\s*["|\']+?'.$class.'\\s*[^>]+?>(.*?)<@is';
        }
        else
        {
            // [^>]*? (zero or more) because the tag may have no attributes, e.g. <li>
            $regex = '@<'.$tag.'[^>]*?>(.*?)</'.$name.'>@is';
        }
        self::log("regex --- " . $regex);
        $matches = array();
        preg_match_all($regex, $content, $matches);
        $all_datas = empty($matches[0]) ? array() : $matches[0];
        $html_datas = empty($matches[1]) ? array() : $matches[1];
        foreach ($all_datas as $i=>$data)
        {
            // Drop matches that do not satisfy the attribute filter
            if (!empty($node['other']))
            {
                $regex = '@'.preg_quote($node['other']['key'], '@').'=[\'|"]'.preg_quote($node['other']['val'], '@').'[\'|"]@is';
                self::log("regex other --- " . $regex);
                if (!preg_match($regex, $data, $matches))
                {
                    continue;
                }
            }
            // Last node and an attribute was requested: return the attribute value
            if ($attr != "html" && $is_last)
            {
                $regex = '@'.preg_quote($attr, '@').'=[\'|"](.*?)[\'|"]@is';
                preg_match($regex, $data, $matches);
                $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]);
            }
            // Otherwise return the inner HTML of the node
            else
            {
                $node_datas[] = trim($html_datas[$i]);
            }
        }
        return $node_datas;
    }

    /**
     * Echo a timestamped debug message when self::$debug is enabled.
     *
     * @param string $msg
     * @return void
     */
    private static function log($msg)
    {
        $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n";
        if (self::$debug)
        {
            echo $msg;
        }
    }
}
//$xpath = "ul.top-nav-dropdown li";
//$xpath = "i.zg-icon";
//print_r($nodes);
//exit;
// [^>]+ 不是>的字符重复一次到多次, ? 表示不贪婪
// \s 表示空白字符
// * 表示0次或者多次
// + 表示1次或者多次
//
// 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。
// \\0 表示整个表达式
// \\1表示第1个表达式
// \\2表示第2个表达式
// $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
//preg_match_all($regex, $content, $matches);
//print_r($matches);
//exit;
// 用法
//$content = file_get_contents("./test.html");
//$query = "ul#top-nav-profile-dropdown li a";
//$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']";
//cls_query::init($content);
//$list = cls_query::query($query, "href");
//print_r($list);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,121 @@
<?php
/**
* redis 客户端
* redis的协议可参考这个文章http://redis.cn/topics/protocol.html
*
* @version 2.7.0
* @copyright 1997-2018 The PHP Group
* @author seatle <seatle@foxmail.com>
* @created time :2018-01-03
*/
/**
 * Tiny Redis client speaking RESP over a plain TCP stream.
 *
 * Every method call is forwarded to the server as a command through
 * __call(), e.g. $redis->set("name", "abc") or $redis->get("name").
 */
class cls_redis_client
{
    /** @var resource|false Stream returned by stream_socket_client() */
    private $redis_socket = false;

    /**
     * Open the connection.
     *
     * @param string $host
     * @param int    $port
     * @param int    $timeout Connect timeout in seconds
     * @throws Exception When the connection cannot be established
     */
    public function __construct($host='127.0.0.1', $port=6379, $timeout = 3)
    {
        $this->redis_socket = stream_socket_client("tcp://".$host.":".$port, $errno, $errstr, $timeout);
        if ( !$this->redis_socket )
        {
            throw new Exception("{$errno} - {$errstr}");
        }
    }

    /**
     * Close the connection if one was opened.
     */
    public function __destruct()
    {
        // The constructor may have failed, leaving false here
        if (is_resource($this->redis_socket))
        {
            fclose($this->redis_socket);
        }
    }

    /**
     * Send "$name arg1 arg2 ..." as a RESP array of bulk strings and return
     * the decoded reply.
     *
     * @param string $name Command name
     * @param array  $args Command arguments
     * @return mixed See read_response()
     * @throws Exception On write/read failure or an error reply
     */
    public function __call($name, $args)
    {
        $crlf = "\r\n";
        array_unshift($args, $name);
        $command = '*' . count($args) . $crlf;
        foreach ($args as $arg)
        {
            $command .= '$' . strlen($arg) . $crlf . $arg . $crlf;
        }
        $this->write_all($command);
        return $this->read_response();
    }

    /**
     * Write the whole buffer, retrying after partial writes.
     *
     * @param string $command
     * @return void
     * @throws Exception When the stream stops accepting data
     */
    private function write_all($command)
    {
        $total = strlen($command);
        $sent = 0;
        while ($sent < $total)
        {
            $written = fwrite($this->redis_socket, substr($command, $sent));
            if ($written === FALSE || $written <= 0)
            {
                throw new Exception('Failed to write entire command to stream');
            }
            $sent += $written;
        }
    }

    /**
     * Read exactly $size bytes from the stream.
     *
     * @param int $size
     * @return string
     * @throws Exception When the stream fails or ends early
     */
    private function read_bytes($size)
    {
        $buffer = '';
        $remaining = $size;
        while ($remaining > 0)
        {
            $chunk = fread($this->redis_socket, $remaining > 1024 ? 1024 : $remaining);
            // '' means EOF; without this check the loop would never end
            if ($chunk === FALSE || $chunk === '')
            {
                throw new Exception('Failed to read response from stream');
            }
            $buffer .= $chunk;
            $remaining -= strlen($chunk);
        }
        return $buffer;
    }

    /**
     * Read and decode one RESP reply.
     *
     * @return mixed TRUE for "+OK", string for other status replies and bulk
     *               strings ('' for an empty bulk string), NULL for nil,
     *               int for integer replies, array for multi-bulk replies
     * @throws Exception On an error reply, a read failure or an unknown reply type
     */
    private function read_response()
    {
        $line = fgets($this->redis_socket, 1024);
        if ($line === FALSE)
        {
            throw new Exception('Failed to read response from stream');
        }
        $reply = trim($line);
        switch (substr($reply, 0, 1))
        {
            /* Error reply */
            case '-':
                throw new Exception(trim(substr($reply, 1)));
            /* Status reply */
            case '+':
                $response = substr($reply, 1);
                if ($response === 'OK')
                {
                    $response = TRUE;
                }
                break;
            /* Bulk reply */
            case '$':
                if ($reply == '$-1')
                {
                    $response = NULL;
                    break;
                }
                $size = intval(substr($reply, 1));
                // A zero-length bulk string is '' and must stay distinguishable from nil
                $response = $size > 0 ? $this->read_bytes($size) : '';
                $this->read_bytes(2); /* discard crlf */
                break;
            /* Multi-bulk reply */
            case '*':
                $count = intval(substr($reply, 1));
                if ($count == -1)
                {
                    return NULL;
                }
                $response = array();
                for ($i = 0; $i < $count; $i++)
                {
                    $response[] = $this->read_response();
                }
                break;
            /* Integer reply */
            case ':':
                $response = intval(substr($reply, 1));
                break;
            default:
                // Plain Exception: RedisException only exists when the phpredis extension is loaded
                throw new Exception("Unknown response: {$reply}");
        }
        return $response;
    }
}
//$redis = new cls_redis_client();
//var_dump($redis->auth("foobared"));
//var_dump($redis->set("name",'abc'));
//var_dump($redis->get("name"));

View File

@ -0,0 +1,179 @@
<?php
// Set PHP's memory_limit to 128M for this server script.
ini_set("memory_limit", "128M");
/**
* redis 服务端
* 多进程阻塞式
* redis-benchmark -h 127.0.0.1 -p 11211 -t set -n 80000 -q
*
* @version 2.7.0
* @copyright 1997-2018 The PHP Group
* @author seatle <seatle@foxmail.com>
* @created time :2018-01-03
*/
/**
 * Toy Redis-protocol server: pre-forks blocking worker processes that accept
 * connections, parse RESP requests and pass them to the onMessage callback.
 *
 * NOTE(review): $redis_kv_data lives in each forked worker separately, so a
 * value written through one worker is presumably not visible to the others.
 */
class cls_redis_server
{
    /** @var resource|false Listening socket */
    private $socket = false;
    /** @var int Number of worker processes to keep alive */
    private $process_num = 3;
    /** @var array pid => pid of the running worker processes */
    private $pids = array();
    /** @var array Key/value storage used by the message handler */
    public $redis_kv_data = array();
    /** @var callable|null function($conn, $request) invoked for every parsed request */
    public $onMessage = null;

    /**
     * Bind the listening socket; terminates the script when binding fails.
     *
     * @param string $host
     * @param int    $port
     */
    public function __construct($host="0.0.0.0", $port=6379)
    {
        $this->socket = stream_socket_server("tcp://".$host.":".$port,$errno, $errstr);
        if (!$this->socket) die($errstr."--".$errno);
        echo "listen $host $port \r\n";
    }

    /**
     * Read one RESP value from the connection.
     *
     * All lengths are handled as BYTE counts (strlen/substr); the previous
     * mix of mb_* and byte functions broke on multibyte payloads.
     *
     * @param resource $conn
     * @return array|string|null Array for "*", string for "$" and inline
     *                           lines, null on EOF or a nil bulk string
     * @throws Exception 'RECEIVING' when the stream ends inside a bulk string
     */
    private function parse_resp(&$conn)
    {
        // Read one line (terminated by \r\n)
        $line = fgets($conn);
        if($line === '' || $line === false)
        {
            return null;
        }
        // First byte is the type marker
        $type = $line[0];
        // Strip the type marker and the trailing \r\n
        $line = (string) substr($line, 1, -2);
        switch ( $type )
        {
            case "*":
                $count = (int) $line;
                $data = array();
                for ($i = 1; $i <= $count; $i++)
                {
                    $data[] = $this->parse_resp($conn);
                }
                return $data;
            case "$":
                if ($line == '-1')
                {
                    return null;
                }
                // Payload plus the trailing \r\n
                $length = (int) $line + 2;
                $data = '';
                // fread() may return fewer bytes than requested; keep reading
                while (strlen($data) < $length)
                {
                    $block = fread($conn, $length - strlen($data));
                    if ($block === false || $block === '')
                    {
                        throw new Exception('RECEIVING');
                    }
                    $data .= $block;
                }
                return (string) substr($data, 0, -2);
        }
        return $line;
    }

    /**
     * Fork one worker. The child serves connections forever; the parent
     * records the child's pid.
     *
     * @return void
     */
    private function start_worker_process()
    {
        $pid = pcntl_fork();
        switch ($pid)
        {
            case -1:
                echo "fork error : " . pcntl_strerror(pcntl_get_last_error()) . " \r\n";
                exit;
            case 0:
                while ( true )
                {
                    echo "PID ".posix_getpid()." waiting...\n";
                    // Block until a client connects
                    $conn = stream_socket_accept($this->socket, -1);
                    if ( !$conn )
                    {
                        continue;
                    }
                    $this->serve_connection($conn);
                }
            default:
                $this->pids[$pid] = $pid;
                break;
        }
    }

    /**
     * Handle requests on one connection until the client disconnects or
     * sends something that cannot be parsed, then close it.
     *
     * Example request: "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
     *
     * @param resource $conn
     * @return void
     */
    private function serve_connection($conn)
    {
        while ( true )
        {
            try
            {
                $arr = $this->parse_resp($conn);
            }
            catch (Exception $e)
            {
                // Truncated request: drop this client, keep the worker alive
                $arr = null;
            }
            if ( !is_array($arr) && !$arr )
            {
                fclose($conn);
                return;
            }
            if ($this->onMessage)
            {
                call_user_func($this->onMessage, $conn, $arr);
            }
        }
    }

    /**
     * Start the workers and supervise them forever, replacing any that exit.
     *
     * @return void
     */
    public function run()
    {
        for($i = 1; $i <= $this->process_num; $i++)
        {
            $this->start_worker_process();
        }
        while( true )
        {
            foreach ($this->pids as $i => $pid)
            {
                if($pid)
                {
                    $res = pcntl_waitpid($pid, $status,WNOHANG);
                    if ( $res == -1 || $res > 0 )
                    {
                        $this->start_worker_process();
                        unset($this->pids[$pid]);
                    }
                }
            }
            sleep(1);
        }
    }
}
// Start a server on the default address and register a minimal command
// handler: SET and GET operate on $server->redis_kv_data, every other
// request is acknowledged with +OK.
$server = new cls_redis_server();
$server->onMessage = function($conn, $info) use($server)
{
    // Inline (non-array) requests and empty arrays are simply acknowledged
    if ( !is_array($info) || !isset($info[0]) )
    {
        fwrite($conn,"+OK\r\n");
        return;
    }
    $command = strtoupper($info[0]);
    if ( $command == "SET" )
    {
        // Both key and value are required
        if ( !isset($info[1], $info[2]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'set' command\r\n");
            return;
        }
        $server->redis_kv_data[$info[1]] = $info[2];
        fwrite($conn, "+OK\r\n");
    }
    else if ( $command == "GET" )
    {
        if ( !isset($info[1]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'get' command\r\n");
            return;
        }
        $key = $info[1];
        if ( isset($server->redis_kv_data[$key]) )
        {
            $val = $server->redis_kv_data[$key];
            fwrite($conn, "$".strlen($val)."\r\n".$val."\r\n");
        }
        else
        {
            // Missing key: reply with a nil bulk string rather than an empty value
            fwrite($conn, "$-1\r\n");
        }
    }
    else
    {
        fwrite($conn,"+OK\r\n");
    }
};
$server->run();

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,466 @@
<?php
/**
* Curl操作类
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.epooll.com/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
/**
 * Queue HTTP requests and run them either one at a time or concurrently
 * through curl_multi with a bounded window of simultaneous transfers.
 *
 * Usage:
 *   $curl = new rolling_curl();
 *   $curl->callback = function($output, $info, $request, $error) { ... };
 *   $curl->get('http://example.com/a');
 *   $curl->get('http://example.com/b');
 *   $curl->execute();
 */
class rolling_curl
{
    /**
     * @var int
     *
     * Maximum number of simultaneous transfers. With 8 queued requests and a
     * window of 5, five start immediately and the rest start as slots free up.
     */
    public $window_size = 5;

    /**
     * @var float
     *
     * Timeout (seconds) passed to curl_multi_select().
     */
    private $timeout = 10;

    /**
     * @var callable|null
     *
     * Invoked for every finished request as
     * callback($output, $info, $request, $error).
     */
    public $callback;

    /**
     * @var array
     *
     * Default cURL options applied to every request.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        // TIMEOUT covers connect time plus transfer time, so it must be
        // larger than CONNECTTIMEOUT for the latter to have any effect.
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 60,
        CURLOPT_HEADER => 0,
        // Required for timeouts to work reliably in multi-handle scenarios
        CURLOPT_NOSIGNAL => 1,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
    );

    /**
     * @var array Default headers as name => value
     */
    private $headers = array();

    /**
     * @var array Queued requests
     */
    private $requests = array();

    /**
     * @var array Maps a handle key (see handle_key()) to its request index
     */
    private $requestMap = array();

    public function __construct()
    {
    }

    /**
     * Set the total transfer timeout (CURLOPT_TIMEOUT).
     *
     * @param int $timeout Seconds
     * @return void
     */
    public function set_timeout($timeout)
    {
        $this->options[CURLOPT_TIMEOUT] = $timeout;
    }

    /**
     * Set the proxy (CURLOPT_PROXY).
     *
     * @param string $proxy
     * @return void
     */
    public function set_proxy($proxy)
    {
        $this->options[CURLOPT_PROXY] = $proxy;
    }

    /**
     * Set the Referer (CURLOPT_REFERER).
     *
     * @param string $referer
     * @return void
     */
    public function set_referer($referer)
    {
        $this->options[CURLOPT_REFERER] = $referer;
    }

    /**
     * Set the User-Agent (CURLOPT_USERAGENT).
     *
     * @param string $useragent
     * @return void
     */
    public function set_useragent($useragent)
    {
        $this->options[CURLOPT_USERAGENT] = $useragent;
    }

    /**
     * Set the Cookie header value (CURLOPT_COOKIE).
     *
     * @param string $cookie
     * @return void
     */
    public function set_cookie($cookie)
    {
        $this->options[CURLOPT_COOKIE] = $cookie;
    }

    /**
     * Set the file cookies are written to (CURLOPT_COOKIEJAR).
     *
     * @param string $cookiejar
     * @return void
     */
    public function set_cookiejar($cookiejar)
    {
        $this->options[CURLOPT_COOKIEJAR] = $cookiejar;
    }

    /**
     * Set the file cookies are read from (CURLOPT_COOKIEFILE).
     *
     * @param string $cookiefile
     * @return void
     */
    public function set_cookiefile($cookiefile)
    {
        $this->options[CURLOPT_COOKIEFILE] = $cookiefile;
    }

    /**
     * Choose whether response headers are included in the output (CURLOPT_HEADER).
     *
     * @param mixed $http_raw
     * @return void
     */
    public function set_http_raw($http_raw = false)
    {
        $this->options[CURLOPT_HEADER] = $http_raw;
    }

    /**
     * Add CLIENT-IP and X-FORWARDED-FOR default headers carrying $ip.
     * Headers that are already set are kept.
     *
     * @param string $ip
     * @return void
     */
    public function set_ip($ip)
    {
        $headers = array(
            'CLIENT-IP'=>$ip,
            'X-FORWARDED-FOR'=>$ip,
        );
        $this->headers = $this->headers + $headers;
    }

    /**
     * Add default headers; headers that are already set are kept.
     *
     * @param array $headers name => value
     * @return void
     */
    public function set_headers($headers)
    {
        $this->headers = $this->headers + $headers;
    }

    /**
     * Add a default Host header unless one is already set.
     *
     * @param string $hosts
     * @return void
     */
    public function set_hosts($hosts)
    {
        $headers = array(
            'Host'=>$hosts,
        );
        $this->headers = $this->headers + $headers;
    }

    /**
     * Request gzip-compressed responses when $gzip is truthy.
     *
     * @param mixed $gzip
     * @return void
     */
    public function set_gzip($gzip)
    {
        if ($gzip)
        {
            $this->options[CURLOPT_ENCODING] = 'gzip';
        }
    }

    /**
     * Queue a request.
     *
     * @param string       $url
     * @param string       $method  "GET" or "POST" (case-insensitive)
     * @param array|string $fields  Query parameters (GET) or body (POST)
     * @param array        $headers Per-request headers as name => value
     * @param array        $options Per-request cURL options
     * @return bool Always true
     */
    public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
    {
        $this->requests[] = array('url'=>$url,'method'=>$method,'fields'=>$fields,'headers'=>$headers,'options'=>$options);
        return true;
    }

    /**
     * Build the final cURL option array for one queued request.
     *
     * For GET requests with fields the query string is appended to the URL
     * (using "&" when the URL already contains "?"). The previous code built
     * that URL but then passed the bare URL to cURL, dropping the fields.
     *
     * @param array $request Entry as stored by request()
     * @return array Options suitable for curl_setopt_array()
     */
    public function get_options($request)
    {
        $options = $this->options;
        $headers = $this->headers;
        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
        {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }
        $url = $request['url'];
        $method = strtolower($request['method']);
        // GET: fold the fields into the URL
        if ($method == 'get' && !empty($request['fields']))
        {
            $query = is_array($request['fields']) ? http_build_query($request['fields']) : $request['fields'];
            $separator = strpos($url, '?') === false ? '?' : '&';
            $url .= $separator . $query;
        }
        // POST: send the fields as the body
        if ($method == 'post')
        {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request['fields'];
        }
        // Per-request options and headers win over the defaults
        if ($request['options'])
        {
            $options = $request['options'] + $options;
        }
        if ($request['headers'])
        {
            $headers = $request['headers'] + $headers;
        }
        // cURL expects a list of "Name:value" strings
        $header_lines = array();
        foreach ($headers as $k=>$v)
        {
            $header_lines[] = $k.":".$v;
        }
        $options[CURLOPT_URL] = $url;
        $options[CURLOPT_HTTPHEADER] = $header_lines;
        return $options;
    }

    /**
     * Queue a GET request.
     *
     * @param string       $url
     * @param array|string $fields
     * @param array        $headers
     * @param array        $options
     * @return bool
     */
    public function get($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'get', $fields, $headers, $options);
    }

    /**
     * Queue a POST request.
     *
     * $fields may be an array, an http_build_query() string (both readable
     * through $_POST on the receiving side) or a raw body such as JSON
     * (readable only from the request input stream).
     *
     * @param string       $url
     * @param array|string $fields
     * @param array        $headers
     * @param array        $options
     * @return bool
     */
    public function post($url, $fields = array(), $headers = array(), $options = array())
    {
        return $this->request($url, 'post', $fields, $headers, $options);
    }

    /**
     * Run all queued requests.
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool false when nothing is queued; for a single request
     *                     without callback the response body; otherwise true
     */
    public function execute($window_size = null)
    {
        $count = sizeof($this->requests);
        if ($count == 0)
        {
            return false;
        }
        // Exactly one request: no need for the multi interface
        elseif ($count == 1)
        {
            return $this->single_curl();
        }
        else
        {
            return $this->rolling_curl($window_size);
        }
    }

    /**
     * Execute the single queued request with a plain easy handle.
     *
     * @return string|bool Response body (false on failure) when no callback
     *                     is set, true otherwise
     */
    private function single_curl()
    {
        $ch = curl_init();
        // Take the request off the queue
        $request = array_shift($this->requests);
        $options = $this->get_options($request);
        curl_setopt_array($ch, $options);
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = null;
        if ($output === false)
        {
            $error = curl_error( $ch );
        }
        curl_close($ch);
        // Mirror the multi-request behaviour: hand the result to the callback if there is one
        if ($this->callback)
        {
            if (is_callable($this->callback))
            {
                call_user_func($this->callback, $output, $info, $request, $error);
            }
            return true;
        }
        return $output;
    }

    /**
     * Key identifying a cURL handle in $requestMap.
     *
     * cURL handles are resources before PHP 8 and objects from PHP 8 on;
     * objects cannot be cast to string.
     *
     * @param resource|object $ch
     * @return string
     */
    private function handle_key($ch)
    {
        return is_object($ch) ? spl_object_hash($ch) : (string) $ch;
    }

    /**
     * Execute all queued requests concurrently, keeping at most
     * $this->window_size transfers in flight.
     *
     * @param int|null $window_size Overrides $this->window_size when truthy
     * @return bool Always true
     */
    private function rolling_curl($window_size = null)
    {
        if ($window_size)
            $this->window_size = $window_size;
        $requests = array_values($this->requests);
        $total = sizeof($requests);
        // Never open more slots than there are requests (local value only,
        // so the configured window size survives for later batches)
        $window = $this->window_size;
        if ($total < $window)
            $window = $total;
        if ($window < 2)
            exit("Window size must be greater than 1");

        $master = curl_multi_init();
        $handles = array();
        $this->requestMap = array();
        // Start the first batch
        for ($i = 0; $i < $window; $i++)
        {
            $ch = curl_init();
            curl_setopt_array($ch, $this->get_options($requests[$i]));
            curl_multi_add_handle($master, $ch);
            $key = $this->handle_key($ch);
            $handles[$key] = $ch;
            $this->requestMap[$key] = $i;
        }

        $running = 0;
        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;
            if ($execrun != CURLM_OK) { break; }
            // Collect every transfer that has finished
            while ($done = curl_multi_info_read($master))
            {
                $handle = $done['handle'];
                $info = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);
                $error = curl_error($handle);
                $key = $this->handle_key($handle);
                $request = $requests[$this->requestMap[$key]];
                unset($this->requestMap[$key], $handles[$key]);
                if (is_callable($this->callback))
                {
                    call_user_func($this->callback, $output, $info, $request, $error);
                }
                // A slot is free: start the next queued request
                if ($i < $total)
                {
                    $ch = curl_init();
                    curl_setopt_array($ch, $this->get_options($requests[$i]));
                    curl_multi_add_handle($master, $ch);
                    $next_key = $this->handle_key($ch);
                    $handles[$next_key] = $ch;
                    $this->requestMap[$next_key] = $i;
                    $i++;
                }
                // Release the finished handle
                curl_multi_remove_handle($master, $handle);
                curl_close($handle);
            }
            // Yield the CPU while transfers are in flight
            if ($running)
            {
                if (curl_multi_select($master, $this->timeout) == -1)
                {
                    usleep(1000);
                }
            }
            // $running is stale when handles were added after the last
            // curl_multi_exec(); outstanding map entries keep the loop going
            // so those requests are not silently dropped.
        } while ($running || !empty($this->requestMap));

        // Only non-empty when the loop was left on a multi error
        foreach ($handles as $ch)
        {
            curl_multi_remove_handle($master, $ch);
            curl_close($ch);
        }
        curl_multi_close($master);
        // Reset the queue so a later batch on the same instance does not
        // re-run these requests; the properties themselves must stay defined.
        $this->requests = array();
        $this->requestMap = array();
        return true;
    }

    /**
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}

7
vendor/owner888/phpspider/test.php vendored Normal file
View File

@ -0,0 +1,7 @@
<?php
// array_filter() without a callback drops the empty strings; the keys of
// the remaining entries are preserved.
$arr = array_filter(array('fff', 'ggg', '', ''));
print_r($arr);

32
vendor/owner888/phpspider/worker.php vendored Normal file
View File

@ -0,0 +1,32 @@
<?php
// Gearman worker demo: registers a "reverse" job and serves jobs until
// work() fails or reports a return code other than GEARMAN_SUCCESS.
echo "Starting\n";
$gmworker = new GearmanWorker();
$gmworker->addServer('10.10.10.238');
$gmworker->addFunction("reverse", "reverse_fn");
print "Waiting for job...\n";
while ($gmworker->work())
{
    if ($gmworker->returnCode() == GEARMAN_SUCCESS)
    {
        continue;
    }
    echo "return_code: " . $gmworker->returnCode() . "\n";
    break;
}

/**
 * Job handler: waits three seconds, echoes the workload and returns it reversed.
 *
 * @param GearmanJob $job
 * @return string
 */
function reverse_fn($job)
{
    sleep(3);
    echo $job->workload()."\n";
    return strrev($job->workload());
}
echo "hello\n";
?>