2020-01-14 19:17:29 +08:00

3599 lines
116 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
// +----------------------------------------------------------------------
// | PHPSpider [ A PHP Framework For Crawler ]
// +----------------------------------------------------------------------
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: Seatle Yang <seatle@foxmail.com>
// +----------------------------------------------------------------------
//----------------------------------
// PHPSpider核心类文件
// ***********
// 泛域名抓取优化版 BY KEN a-site@foxmail.com
// ***********
// * 泛域名设置domain = array('*')
// * 增加子域名数量限制 $max_sub_num = 100
//----------------------------------
namespace phpspider\core;
require_once __DIR__.'/constants.php';
use Exception;
use phpspider\core\db;
use phpspider\core\log;
use phpspider\core\queue;
use phpspider\core\requests;
use phpspider\core\selector;
use phpspider\core\util;
// Prepare the data directory tree at load time (per the original note this
// generates the directories on startup — the behaviour of util::path_exists
// itself is not visible here; TODO confirm it creates missing paths).
util::path_exists(PATH_DATA);
util::path_exists(PATH_DATA.'/lock');
util::path_exists(PATH_DATA.'/log');
util::path_exists(PATH_DATA.'/cache');
util::path_exists(PATH_DATA.'/status');
class phpspider
{
/**
* 版本号
* @var string
*/
const VERSION = '2.1.5';
/**
* 爬虫爬取每个网页的时间间隔,0表示不延时, 单位: 毫秒
*/
const INTERVAL = 100;
/**
* 爬虫爬取每个网页的超时时间, 单位: 秒
*/
const TIMEOUT = 5;
/**
* 爬取失败次数, 不想失败重新爬取则设置为0
*/
const MAX_TRY = 0;
/**
* 爬虫爬取网页所使用的浏览器类型: pc/Mac、ios、android
* 默认类型是PC
*/
const AGENT_PC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36';
const AGENT_IOS = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1';
const AGENT_ANDROID = 'Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S';
/**
* pid文件的路径及名称
* @var string
*/
//public static $pid_file = '';
/**
* 日志目录, 默认在data根目录下
* @var mixed
*/
//public static $log_file = '';
/**
* 主任务进程ID
*/
//public static $master_pid = 0;
/**
* 所有任务进程ID
*/
//public static $taskpids = array();
/**
* Daemonize.
*
* @var bool
*/
public static $daemonize = false;
/**
* 当前进程是否终止
*/
public static $terminate = false;
/**
* 是否分布式
*/
public static $multiserver = false;
/**
* 当前服务器ID
*/
public static $serverid = 1;
/**
* 主任务进程
*/
public static $taskmaster = true;
/**
* 当前任务ID
*/
public static $taskid = 1;
/**
* 当前任务进程ID
*/
public static $taskpid = 1;
/**
* 并发任务数
*/
public static $tasknum = 1;
/**
* Whether the master task has already forked its child tasks
*/
public static $fork_task_complete = false;
/**
* 是否使用Redis
*/
public static $use_redis = false;
/**
* 是否保存爬虫运行状态
*/
public static $save_running_state = false;
/**
* 配置
*/
public static $configs = array();
/**
* 要抓取的URL队列
md5(url) => array(
'url' => '', // 要爬取的URL
'url_type' => '', // 要爬取的URL类型,scan_page、list_page、content_page
'method' => 'get', // 默认为"GET"请求, 也支持"POST"请求
'headers' => array(), // 此url的Headers, 可以为空
'params' => array(), // 发送请求时需添加的参数, 可以为空
'context_data'=> '', // 此url附加的数据, 可以为空
'proxy' => false, // 是否使用代理
'try_num' => 0 // 抓取次数
'max_try' => 0 // 允许抓取失败次数
)
*/
public static $collect_queue = array();
/**
* 要抓取的URL数组
* md5($url) => time()
*/
public static $collect_urls = array();
/**
* 要抓取的URL数量
*/
public static $collect_urls_num = 0;
/**
* 已经抓取的URL数量
*/
public static $collected_urls_num = 0;
/**
* 当前进程采集成功数
*/
public static $collect_succ = 0;
/**
* 当前进程采集失败数
*/
public static $collect_fail = 0;
/**
* 提取到的字段数
*/
public static $fields_num = 0;
/**
* 【KEN】提取到的页面数按域名计数容器 结构为 domain => number
*/
public static $pages_num = array();
/**
* 【KEN】单域名允许抓取的最大页面数,0为不限制
*/
public static $max_pages = 0;
/**
* 【KEN】花费的抓取时长计数容器 结构为 domain => number
*/
public static $duration = array();
/**
* 【KEN】单域名允许抓取的最大时长单位秒,0为不限制
*/
public static $max_duration = 0;
/**
* 【KEN】单域名最大子域名发现数量 防止掉进蜘蛛池推荐值3000多数大型网站上限
*/
public static $max_sub_num = 3000; //建议值 3000
/**
* 【KEN】子进程未获取任务超时退出前等待计时器
*/
public static $stand_by_time = 0;
/**
* 【KEN】子进程未获取任务超时退出前最大等待时长/秒,全部任务束后,子进程将会等待的时间,以便有缓冲时间,获得新的任务
*/
public static $max_stand_by_time = 60; //建议值 60
/**
* 【KEN】每个主机并发上限降低对方网站流量压力和减少被阻挡概率建议值 6 ,须与 queue_order = rand 一起使用
*/
public static $max_task_per_host = 0; //0值和非0值会使用不同类型的队列缓存库从0改为非0值或从非0值改为0需清空队列缓存库再运行否则任务无法添加
public static $task_per_host_counter = array(); //计数容器
/**
* 采集深度
*/
public static $depth_num = 0;
/**
* 爬虫开始时间
*/
public static $time_start = 0;
/**
* 任务状态
*/
public static $task_status = array();
// 导出类型配置
public static $export_type = '';
public static $export_file = '';
public static $export_conf = '';
public static $export_table = '';
// 数据库配置
public static $db_config = array();
// 队列配置
public static $queue_config = array();
// 运行面板参数长度
public static $server_length = 10;
public static $tasknum_length = 8;
public static $taskid_length = 8;
public static $pid_length = 8;
public static $mem_length = 8;
public static $urls_length = 15;
public static $speed_length = 6;
/**
* 爬虫初始化时调用, 用来指定一些爬取前的操作
*
* @var mixed
* @access public
*/
public $on_start = null;
/**
* URL采集前调用
* 比如有时需要根据某个特定的URL来决定这次的请求是否使用代理 / 或使用哪个代理
*
* @var mixed
* @access public
*/
public $on_before_download_page = null;
/**
* 网页状态码回调
*
* @var mixed
* @access public
*/
public $on_status_code = null;
/**
* 判断当前网页是否被反爬虫, 需要开发者实现
*
* @var mixed
* @access public
*/
public $is_anti_spider = null;
/**
* 在一个网页下载完成之后调用, 主要用来对下载的网页进行处理
*
* @var mixed
* @access public
*/
public $on_download_page = null;
/**
* 在一个attached_url对应的网页下载完成之后调用. 主要用来对下载的网页进行处理
*
* @var mixed
* @access public
*/
public $on_download_attached_page = null;
/**
* 当前页面抽取到URL
*
* @var mixed
* @access public
*/
public $on_fetch_url = null;
/**
* URL属于入口页
* 在爬取到入口url的内容之后, 添加新的url到待爬队列之前调用
* 主要用来发现新的待爬url, 并且能给新发现的url附加数据
*
* @var mixed
* @access public
*/
public $on_scan_page = null;
/**
* URL属于列表页
* 在爬取到列表页url的内容之后, 添加新的url到待爬队列之前调用
* 主要用来发现新的待爬url, 并且能给新发现的url附加数据
*
* @var mixed
* @access public
*/
public $on_list_page = null;
/**
* URL属于内容页
* 在爬取到内容页url的内容之后, 添加新的url到待爬队列之前调用
* 主要用来发现新的待爬url, 并且能给新发现的url附加数据
*
* @var mixed
* @access public
*/
public $on_content_page = null;
/**
* 在抽取到field内容之后调用, 对其中包含的img标签进行回调处理
*
* @var mixed
* @access public
*/
public $on_handle_img = null;
/**
* 当一个field的内容被抽取到后进行的回调, 在此回调中可以对网页中抽取的内容作进一步处理
*
* @var mixed
* @access public
*/
public $on_extract_field = null;
/**
* 在一个网页的所有field抽取完成之后, 可能需要对field进一步处理, 以发布到自己的网站
*
* @var mixed
* @access public
*/
public $on_extract_page = null;
/**
* 如果抓取的页面是一个附件文件, 比如图片、视频、二进制文件、apk、ipad、exe
* 就不去分析他的内容提取field了, 提取field只针对HTML
*
* @var mixed
* @access public
*/
public $on_attachment_file = null;
/**
 * Build a spider from a configuration array.
 *
 * Every option that is not set receives its default, the export / database /
 * queue settings are copied into the static properties, and the completed
 * configuration is stored in self::$configs.
 *
 * @param array $configs spider configuration
 */
public function __construct($configs = array())
{
    // Enable ticks; the original note says this fixes Ctrl+C not stopping the spider under PHP 7
    declare(ticks = 1);

    // Show log output right away so validation errors are visible
    log::$log_show = true;
    log::$log_file = ! isset($configs['log_file']) ? PATH_DATA.'/phpspider.log' : $configs['log_file'];
    log::$log_type = ! isset($configs['log_type']) ? false : $configs['log_type'];

    // Easter egg: the entry script must contain both marker comments
    $entry_files  = get_included_files();
    $entry_source = file_get_contents($entry_files[0]);
    if ( ! (preg_match("#/\* Do NOT delete this comment \*/#", $entry_source) && preg_match("#/\* 不要删除这段注释 \*/#", $entry_source)))
    {
        log::error("Unknown error...");
        exit;
    }

    // Option => default value, applied only where the option is not set
    $defaults = array(
        'name'              => 'phpspider',
        'proxy'             => false,
        'user_agent'        => self::AGENT_PC,
        'client_ip'         => array(),
        'interval'          => self::INTERVAL,
        'timeout'           => self::TIMEOUT,
        'max_try'           => self::MAX_TRY,
        'max_depth'         => 0,
        'max_fields'        => 0,
        'export'            => array(),
        // Options added by KEN <a-site@foxmail.com>
        'max_pages'         => self::$max_pages,
        'max_duration'      => self::$max_duration,
        'max_sub_num'       => self::$max_sub_num,
        'max_stand_by_time' => self::$max_stand_by_time,
        'max_task_per_host' => self::$max_task_per_host,
    );
    foreach ($defaults as $option => $fallback)
    {
        if ( ! isset($configs[$option]))
        {
            $configs[$option] = $fallback;
        }
    }

    // A per-host concurrency limit forces the random queue order
    if ($configs['max_task_per_host'] > 0)
    {
        $configs['queue_order'] = 'rand';
    }
    elseif ( ! isset($configs['queue_order']))
    {
        $configs['queue_order'] = 'list';
    }

    // Export settings (csv, sql, db)
    $export = $configs['export'];
    self::$export_type  = isset($export['type']) ? $export['type'] : '';
    self::$export_file  = isset($export['file']) ? $export['file'] : '';
    self::$export_table = isset($export['table']) ? $export['table'] : '';

    self::$db_config    = isset($configs['db_config']) ? $configs['db_config'] : array();
    self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array();

    // Concurrent tasks: only when configured, greater than 1 and not on Windows
    if (isset($configs['tasknum']) && $configs['tasknum'] > 1 && ! util::is_win())
    {
        self::$tasknum = $configs['tasknum'];
    }

    // Keep the running state between runs
    if (isset($configs['save_running_state']))
    {
        self::$save_running_state = $configs['save_running_state'];
    }

    // Distributed (multi-server) mode
    if (isset($configs['multiserver']))
    {
        self::$multiserver = $configs['multiserver'];
    }

    // ID of the current server
    if (isset($configs['serverid']))
    {
        self::$serverid = $configs['serverid'];
    }

    // Separate projects by appending the first 4 hex chars of md5(spider name) to the queue prefix
    if (isset(self::$queue_config['prefix']))
    {
        $name_hash = substr(md5($configs['name']), 0, 4);
        self::$queue_config['prefix'] = self::$queue_config['prefix'].'-'.$name_hash;
    }

    self::$configs = $configs;
}
/**
 * Read one configuration entry.
 *
 * @param string $name configuration key
 * @return mixed the stored value, or an empty array when the key is missing or its value is empty
 */
public function get_config($name)
{
    if (empty(self::$configs[$name]))
    {
        return array();
    }

    return self::$configs[$name];
}
/**
 * Push an entry URL onto the queue.
 *
 * The link is typed as content_page or list_page when the URL matches those
 * rules, otherwise it stays a scan_page.
 *
 * @param string $url            URL to enqueue
 * @param mixed  $options        extra link fields (headers, params, ...)
 * @param bool   $allowed_repeat forwarded to queue_lpush()
 * @return mixed result of queue_lpush(), or false when the sub-domain limit is exceeded
 */
public function add_scan_url($url, $options = array(), $allowed_repeat = true)
{
    // Drop the URL when its domain already has more sub-domains than allowed
    if ( ! empty(self::$configs['max_sub_num']))
    {
        $sub_domains = $this->sub_domain_count($url);
        if ($sub_domains > self::$configs['max_sub_num'])
        {
            log::debug('Task('.self::$taskid.') subdomin = '.$sub_domains.' more than '.self::$configs['max_sub_num'].",add_scan_url $url [Skip]");
            return false;
        }
    }

    $link = $options;
    $link['url'] = $url;
    $link['url_type'] = 'scan_page';
    $link = $this->link_uncompress($link);

    // Content pages take precedence over list pages; anything else keeps its type
    if ($this->is_content_page($url))
    {
        $link['url_type'] = 'content_page';
    }
    elseif ($this->is_list_page($url))
    {
        $link['url_type'] = 'list_page';
    }

    // Delivery status
    $status = $this->queue_lpush($link, $allowed_repeat);
    if ( ! $status)
    {
        return $status;
    }

    switch ($link['url_type'])
    {
        case 'scan_page':
            log::debug("Find scan page: {$url}");
            break;
        case 'content_page':
            log::debug("Find content page: {$url}");
            break;
        case 'list_page':
            log::debug("Find list page: {$url}");
            break;
    }

    return $status;
}
/**
 * Add a discovered URL to the queue; per the original note it is usually called
 * from the on_scan_page and on_list_page callbacks.
 *
 * Only URLs recognised as content pages or list pages are queued. The original
 * note warns that two processes passing the same URL at the same time can
 * enqueue it twice.
 *
 * @param string $url     URL to enqueue
 * @param mixed  $options extra link fields (headers, params, ...)
 * @param int    $depth   crawl depth recorded on the link
 * @return mixed result of queue_lpush(), or false when the URL is skipped
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function add_url($url, $options = array(), $depth = 0)
{
    // Drop the URL when its domain already has more sub-domains than max_sub_num
    if ( ! empty(self::$configs['max_sub_num']))
    {
        $sub_domains = $this->sub_domain_count($url);
        if ($sub_domains > self::$configs['max_sub_num'])
        {
            log::debug('Task('.self::$taskid.') subdomin = '.$sub_domains.' more than '.self::$configs['max_sub_num'].",add_url $url [Skip]");
            return false;
        }
    }

    $link = $options;
    $link['url'] = $url;
    $link['depth'] = $depth;
    $link = $this->link_uncompress($link);

    // Content pages take precedence over list pages
    if ($this->is_content_page($url))
    {
        $link['url_type'] = 'content_page';
    }
    elseif ($this->is_list_page($url))
    {
        $link['url_type'] = 'list_page';
    }
    else
    {
        // Neither a content page nor a list page: nothing is queued
        return false;
    }

    // Delivery status
    $status = $this->queue_lpush($link);
    if ( ! $status)
    {
        return $status;
    }

    switch ($link['url_type'])
    {
        case 'scan_page':
            log::debug("Find scan page: {$url}");
            break;
        case 'content_page':
            log::debug("Find content page: {$url}");
            break;
        case 'list_page':
            log::debug("Find list page: {$url}");
            break;
    }

    return $status;
}
/**
 * Tell whether a URL is an allowed entry page, i.e. whether its host is
 * covered by the configured "domains" list.
 *
 * A list whose first element is '*' accepts every host. A missing, empty or
 * non-array "domains" setting never matches and no longer raises
 * undefined-index notices or an in_array() type error.
 *
 * @param mixed $url
 * @return bool
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-12 19:06
 */
public function is_scan_page($url)
{
    $parse_url = parse_url($url);

    // Without a host, or without any configured domain, nothing can match
    if (empty($parse_url['host']) || empty(self::$configs['domains']))
    {
        return false;
    }

    $domains = self::$configs['domains'];

    // 2018-1-3: wildcard, every domain is accepted
    if (isset($domains[0]) && $domains[0] == '*')
    {
        return true;
    }

    // Restricted to the listed domains
    return is_array($domains) && in_array($parse_url['host'], $domains);
}
/**
 * Tell whether a URL is a list page.
 *
 * @param mixed $url
 * @return bool
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-12 19:06
 */
public function is_list_page($url)
{
    // URLs ending in a download/asset file extension are never list pages (2018-02-09)
    $is_file_download = preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url);
    if ($is_file_download)
    {
        return false;
    }

    // Exclusion patterns for list pages, added by KEN <a-site@foxmail.com>
    if ( ! empty(self::$configs['list_url_regexes_remove']))
    {
        foreach (self::$configs['list_url_regexes_remove'] as $exclude_pattern)
        {
            if (preg_match("#{$exclude_pattern}#i", $url))
            {
                return false;
            }
        }
    }

    // No list patterns, or 'x' as the first one: there are no list pages at all
    if (empty(self::$configs['list_url_regexes']) || self::$configs['list_url_regexes'][0] == 'x')
    {
        return false;
    }

    // '*' as the first pattern: every page is a list page
    if (self::$configs['list_url_regexes'][0] == '*')
    {
        return true;
    }

    // Otherwise the first matching pattern decides
    foreach (self::$configs['list_url_regexes'] as $list_pattern)
    {
        if (preg_match("#{$list_pattern}#i", $url))
        {
            return true;
        }
    }

    return false;
}
/**
 * Tell whether a URL is a content page.
 *
 * @param mixed $url
 * @return bool
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-12 19:06
 */
public function is_content_page($url)
{
    // URLs ending in a download/asset file extension are never content pages (2018-02-09)
    $is_file_download = preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url);
    if ($is_file_download)
    {
        return false;
    }

    // Exclusion patterns for content pages, added by KEN <a-site@foxmail.com>
    if ( ! empty(self::$configs['content_url_regexes_remove']))
    {
        foreach (self::$configs['content_url_regexes_remove'] as $exclude_pattern)
        {
            if (preg_match("#{$exclude_pattern}#i", $url))
            {
                return false;
            }
        }
    }

    // No content patterns, or '*' as the first one: every page is a content page
    if (empty(self::$configs['content_url_regexes']) || self::$configs['content_url_regexes'][0] == '*')
    {
        return true;
    }

    // 'x' as the first pattern: no page is a content page
    if (self::$configs['content_url_regexes'][0] == 'x')
    {
        return false;
    }

    // Otherwise the first matching pattern decides
    foreach (self::$configs['content_url_regexes'] as $content_pattern)
    {
        if (preg_match("#{$content_pattern}#i", $url))
        {
            return true;
        }
    }

    return false;
}
/**
 * Parse command.
 * php yourfile.php start | stop | status | kill
 *
 * "start" (the default) returns to the caller, optionally enabling daemon mode
 * with "-d". Every other command terminates the current process.
 *
 * The script path is shell-escaped and matched literally before being used in
 * the process lookup, and "kill" now exits instead of falling through and
 * starting the spider when the kill did not terminate this process.
 *
 * @return void
 */
public function parse_command()
{
    // Command line arguments
    global $argv;
    $start_file = $argv[0];
    // Command
    $command = isset($argv[1]) ? trim($argv[1]) : 'start';
    // Sub command, only -d is supported for now
    $command2 = isset($argv[2]) ? $argv[2] : '';

    // Pipeline printing the PID of every process whose command line contains the script path
    $find_pids = 'ps aux | grep -F -- '.escapeshellarg($start_file)." | grep -v grep | awk '{print \$2}'";

    switch($command)
    {
        // Start phpspider
        case 'start':
            if ($command2 === '-d')
            {
                self::$daemonize = true;
            }
            break;
        case 'stop':
            $info = array();
            exec($find_pids, $info);
            // A single match is treated as "not running"
            // NOTE(review): presumably the one match is this "stop" invocation itself — confirm
            if (count($info) <= 1)
            {
                echo "PHPSpider[$start_file] not run\n";
            }
            else
            {
                // Printed before the signal is sent
                // NOTE(review): presumably because the signal can also reach this process — confirm
                echo "PHPSpider[$start_file] stop success\n";
                exec($find_pids.' |xargs kill -SIGINT', $info);
            }
            exit;
        case 'kill':
            exec($find_pids.' |xargs kill -SIGKILL');
            // Never continue into the crawl if this process is still running after the kill
            exit;
        // Show the phpspider running status
        case 'status':
            exit(0);
        // Unknown command
        default :
            exit("Usage: php yourfile.php {start|stop|status|kill}\n");
    }
}
/**
 * Signal handler.
 *
 * SIGINT flags the process for termination; SIGUSR2 prints a status line.
 *
 * @param int $signal
 */
public function signal_handler($signal)
{
    // Stop.
    if ($signal == SIGINT)
    {
        log::warn('Program stopping...');
        self::$terminate = true;
        return;
    }

    // Show status.
    if ($signal == SIGUSR2)
    {
        echo "show status\n";
    }
}
/**
 * Install signal handler.
 *
 * Does nothing when the pcntl extension is not available.
 *
 * @return void
 */
public function install_signal()
{
    if ( ! function_exists('pcntl_signal'))
    {
        return;
    }

    $handler = array($this, 'signal_handler');

    // stop
    pcntl_signal(SIGINT, $handler, false);
    // status
    pcntl_signal(SIGUSR2, $handler, false);
    // ignore
    pcntl_signal(SIGPIPE, SIG_IGN, false);
}
/**
 * Run as daemon mode.
 *
 * Only acts when self::$daemonize is set: forks, starts a new session and
 * forks again; each parent process exits.
 *
 * @throws Exception when a fork or setsid fails
 */
protected static function daemonize()
{
    if (self::$daemonize)
    {
        // The redis link must be closed before forking
        queue::clear_link();
        umask(0);

        $first_fork = pcntl_fork();
        if ($first_fork === -1)
        {
            throw new Exception('fork fail');
        }
        if ($first_fork > 0)
        {
            exit(0);
        }

        if (posix_setsid() === -1)
        {
            throw new Exception('setsid fail');
        }

        // Fork again avoid SVR4 system regain the control of terminal.
        $second_fork = pcntl_fork();
        if ($second_fork === -1)
        {
            throw new Exception('fork fail');
        }
        if ($second_fork !== 0)
        {
            exit(0);
        }
    }
}
/**
 * Terminate the current process when a stop has been requested.
 *
 * The master task first waits until no child task reports a status any more,
 * removes the server entry and prints the final summary.
 *
 * @return bool false when no termination is pending; otherwise the process exits
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function check_terminate()
{
    if (self::$terminate)
    {
        // Remove the status of the current task
        $this->del_task_status(self::$serverid, self::$taskid);

        if (self::$taskmaster)
        {
            // Wait until every child task has exited
            for (;;)
            {
                $child_alive = false;
                for ($child_id = 2; $child_id <= self::$tasknum; $child_id++)
                {
                    // A single child that still has a status means we are not done
                    if ($this->get_task_status(self::$serverid, $child_id))
                    {
                        $child_alive = true;
                    }
                }
                if ( ! $child_alive)
                {
                    break;
                }
                log::warn('Task stop waiting...');
                sleep(1);
            }

            $this->del_server_list(self::$serverid);

            // Show the final result
            log::$log_show = true;
            $elapsed = util::time2second(intval(microtime(true) - self::$time_start));
            log::note("Spider finished in {$elapsed}");
            $total_pages = $this->get_collected_url_num();
            log::note("Total pages: {$total_pages} \n");
        }

        exit();
    }

    return false;
}
/**
* Entry point of a crawl: parses the CLI command, validates the environment and
* the configuration, queues the configured scan URLs, runs the on_start
* callback and then runs the collection loop.
*
* Any failed validation logs an error and exits the process.
*
* @return void
*/
public function start()
{
$this->parse_command();
// Spider start time
self::$time_start = time();
// Current task ID
self::$taskid = 1;
// Current task process ID (falls back to 1 when posix_getpid is unavailable)
self::$taskpid = function_exists('posix_getpid') ? posix_getpid() : 1;
self::$collect_succ = 0;
self::$collect_fail = 0;
//--------------------------------------------------------------------------------
// Pre-run validation
//--------------------------------------------------------------------------------
// Check the PHP version
if (version_compare(PHP_VERSION, '5.3.0', 'lt'))
{
log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion());
exit;
}
// Check the curl extension
if(!function_exists('curl_init'))
{
log::error('The curl extension was not found');
exit;
}
// Multitasking needs the pcntl extension
if (self::$tasknum > 1 && !function_exists('pcntl_fork'))
{
log::error('Multitasking needs pcntl, the pcntl extension was not found');
exit;
}
// Daemon mode needs the pcntl extension
if (self::$daemonize && !function_exists('pcntl_fork'))
{
log::error('Daemonize needs pcntl, the pcntl extension was not found');
exit;
}
// Cluster mode, saved running state and multitasking all need Redis
if ( self::$multiserver || self::$save_running_state || self::$tasknum > 1 )
{
self::$use_redis = true;
queue::set_connect('default', self::$queue_config);
if (!queue::init())
{
if ( self::$multiserver )
{
log::error('Multiserver needs Redis support, '.queue::$error);
exit;
}
if ( self::$tasknum > 1 )
{
log::error('Multitasking needs Redis support, '.queue::$error);
exit;
}
if ( self::$save_running_state )
{
log::error('Spider kept running state needs Redis support, '.queue::$error);
exit;
}
}
}
// Check the export settings
$this->check_export();
// Check the cache
$this->check_cache();
// Check scan_urls
if (empty(self::$configs['scan_urls']))
{
log::error('No scan url to start');
exit;
}
foreach ( self::$configs['scan_urls'] as $url )
{
// Only the entry URLs from the configuration are checked; the ones added through add_scan_url are not
if (!$this->is_scan_page($url))
{
log::error("Domain of scan_urls (\"{$url}\") does not match the domains of the domain name");
exit;
}
}
// The panel cannot be shown on Windows, so log output is forced on
if (util::is_win())
{
self::$configs['name'] = iconv('UTF-8', 'GB2312//IGNORE', self::$configs['name']);
log::$log_show = true;
}
// Log output is also shown in daemon mode
elseif (self::$daemonize)
{
log::$log_show = true;
}
else
{
log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
}
if (log::$log_show)
{
global $argv;
$start_file = $argv[0];
$header = '';
// ANSI colour codes are only emitted outside Windows
if ( ! util::is_win())
{
$header .= "\033[33m";
}
$header .= "\n[ ".self::$configs['name']." Spider ] is started...\n\n";
$header .= ' * PHPSpider Version: '.self::VERSION."\n";
$header .= " * Documentation: https://doc.phpspider.org\n";
$header .= ' * Task Number: '.self::$tasknum."\n\n";
$header .= "Input \"php $start_file stop\" to quit. Start success.\n";
if ( ! util::is_win())
{
$header .= "\033[0m";
}
log::note($header);
}
// Disabled: restore the configured log state when running as a daemon
//if (self::$daemonize)
//{
//log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
//}
// Per the original note: cleared for both multitasking and distributed mode (distributed mode only clears its own data)
$this->init_redis();
//--------------------------------------------------------------------------------
// Spawn the tasks
//--------------------------------------------------------------------------------
// Add the entry URLs to the queue
foreach ( self::$configs['scan_urls'] as $url )
{
// false means duplicates are not allowed
$this->add_scan_url($url, null, false);
}
// Called at this point so the callback can add entry pages itself
if ($this->on_start)
{
call_user_func($this->on_start, $this);
}
if (!self::$daemonize)
{
if (!log::$log_show)
{
// Clear the screen the first time
$this->clear_echo();
// Show the panel once now; it is refreshed after each collected page
$this->display_ui();
}
}
else
{
$this->daemonize();
}
// Install the signal handlers
$this->install_signal();
// Start collecting
$this->do_collect_page();
// Remove the current server from the server list
$this->del_server_list(self::$serverid);
}
/**
 * Fork one child task.
 *
 * The parent returns immediately. The child keeps collecting until it has been
 * idle for max_stand_by_time seconds, then removes its task status and exits.
 * A failed fork logs an error and exits.
 *
 * @param int $taskid task ID given to the child (do_collect_page passes 2..tasknum)
 * @return void
 */
public function fork_one_task($taskid)
{
    $pid = pcntl_fork();

    // Parent: nothing to record for now (child PID bookkeeping is unused)
    if ($pid > 0)
    {
        return;
    }

    // Fork failed
    if (0 !== $pid)
    {
        log::error("Fork children task({$taskid}) fail...");
        exit;
    }

    // Child process from here on
    log::warn("Fork children task({$taskid}) successful...");

    // Initialise the per-child state
    self::$time_start   = microtime(true);
    self::$taskid       = $taskid;
    self::$taskmaster   = false;
    self::$taskpid      = posix_getpid();
    self::$collect_succ = 0;
    self::$collect_fail = 0;

    queue::set_connect('default', self::$queue_config);
    queue::init();

    // Before exiting, stand by (1 minute by default) in case new links arrive
    self::$stand_by_time = 0;
    while (self::$stand_by_time < self::$configs['max_stand_by_time'])
    {
        $this->do_collect_page();
        log::warn('Task('.self::$taskid.') Stand By '.self::$stand_by_time.'/'.self::$configs['max_stand_by_time'].' s');
        self::$stand_by_time++;
        sleep(1);
    }

    $remaining = $this->queue_lsize();
    log::warn('Task('.self::$taskid.') exit : queue_lsize = '.$remaining);
    $this->del_task_status(self::$serverid, $taskid);

    // 0 signals a normal child exit
    exit(0);
}
/**
* Collection loop: keeps processing links while the queue is not empty.
*
* The master task forks the child tasks once the queue is large enough,
* publishes the current configuration for the children, collects a page and
* refreshes the panel. A child task reloads the published configuration and
* collects a page, or waits when the queue is too small. After every iteration
* the termination flag is checked.
*
* @return void
*/
public function do_collect_page()
{
while( $queue_lsize = $this->queue_lsize() )
{
// Master task
if (self::$taskmaster)
{
// Multitasking is enabled and the child tasks have not been forked yet
if (self::$tasknum > 1 && !self::$fork_task_complete)
{
// Once the queue holds more than tasknum + 2 links, fork the child tasks to collect together
if ($queue_lsize > self::$tasknum + 2)
{
self::$fork_task_complete = true;
// The redis connection must be dropped before forking, otherwise the processes fight over the redis fd
queue::clear_link();
// Child task IDs start at 2; 1 is used by the master
for ($i = 2; $i <= self::$tasknum; $i++)
{
$this->fork_one_task($i);
}
}
}
// In the master, store the current configuration in the cache so children can read run-time changes (2018-02-09)
if (self::$use_redis and ! empty(self::$configs))
{
queue::set('configs_'.self::$configs['name'], json_encode(self::$configs));
}
// Collect a page
$this->collect_page();
// Save the task status
$this->set_task_status();
// Refresh the panel after each collected page
if (!log::$log_show && !self::$daemonize)
{
$this->display_ui();
}
}
// Child task
else
{
// A child only collects while the queue holds more than taskid + 2 links, otherwise it waits
if ($queue_lsize > self::$taskid + 2)
{
// In the child, reload the latest configuration published by the master (2018-02-09)
if (self::$use_redis and ! empty(self::$configs))
{
if ($configs_active = queue::get('configs_'.self::$configs['name']))
{
self::$configs = json_decode($configs_active, true);
}
}
// Collect a page
$this->collect_page();
// Save the task status
$this->set_task_status();
}
else
{
// NOTE(review): the message reports tasknum although the condition above compares against taskid + 2 — confirm which is intended
log::warn('Task('.self::$taskid.') waiting...reason: queue_lsize = '.$queue_lsize.' < tasknum = '.self::$tasknum);
sleep(1);
}
}
// Check whether the process received a stop signal
$this->check_terminate();
}
}
/**
* Take one link from the queue, download it and process it.
*
* Applies the per-domain page / duration limits and the per-host concurrency
* limit, downloads the page through request_url(), runs the user callbacks,
* extracts further URLs and (for content pages) the fields, then sleeps for
* the configured interval.
*
* @return mixed false when no page was processed (empty queue, limit reached,
* failed download, anti-spider hit); nothing is returned after a processed page
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public function collect_page()
{
// Only query the counters when 'info' logging is configured, to avoid unnecessary queue_lsize lookups (2018-02-14)
if (isset(self::$configs['log_type']) and strstr(self::$configs['log_type'], 'info'))
{
$get_collect_url_num = $this->get_collect_url_num();
log::info('task id: '.self::$taskid." Find pages: {$get_collect_url_num} ");
$queue_lsize = $this->queue_lsize();
log::info('task id: '.self::$taskid." Waiting for collect pages: {$queue_lsize} ");
$get_collected_url_num = $this->get_collected_url_num();
log::info('task id: '.self::$taskid." Collected pages: {$get_collected_url_num} ");
// Print the task number when multitasking
if (self::$tasknum > 1)
{
log::info('Current task id: '.self::$taskid);
}
}
// Take the next link; per the original note this is first-in first-out, except with queue_order = rand where links are taken at random
$link = $this->queue_rpop();
if (empty($link))
{
log::warn('Task('.self::$taskid.') Get Task link Fail...Stand By...');
return false;
}
$link = $this->link_uncompress($link);
if (empty($link['url']))
{
log::warn('Task('.self::$taskid.') Get Task url Fail...Stand By...');
return false;
}
self::$stand_by_time = 0; // a link was received, so the idle-exit timer is reset
$url = $link['url'];
// Limit the number of URLs per domain (2018-02-13)
if (isset(self::$configs['max_pages']) and self::$configs['max_pages'] > 0)
{
$domain_pages_num = $this->incr_pages_num($url);
if ($domain_pages_num > self::$configs['max_pages'])
{
log::debug('Task('.self::$taskid.') pages = '.$domain_pages_num.' more than '.self::$configs['max_pages'].", $url [Skip]");
return false;
}
}
// Limit the time spent per domain (2018-02-13)
if (isset(self::$configs['max_duration']) and self::$configs['max_duration'] > 0)
{
$domain_duration = $this->get_duration_num($url);
if ($domain_duration > self::$configs['max_duration'])
{
log::debug('Task('.self::$taskid.') duration = '.$domain_duration.' more than '.self::$configs['max_duration'].", $url [Skip]");
return false;
}
}
// Per-host concurrency check, 2018-5 BY KEN <a-site@foxmail.com>
if (self::$configs['max_task_per_host'] > 0)
{
$task_per_host = $this->get_task_per_host_num($url);
if ($task_per_host < self::$configs['max_task_per_host'])
{
$task_per_host = $this->incr_task_per_host($url);
}
else
{
log::warn('Task('.self::$taskid.') task_per_host = '.$task_per_host.' > '.self::$configs['max_task_per_host'].' ; URL: '.$url.' will be retry later...');
$this->queue_lpush($link); // put the link back into the queue
usleep(100000);
return false;
}
}
// Number of collected pages +1
$this->incr_collected_url_num($url);
// Start time of the page download
$page_time_start = microtime(true);
// Called before the page is downloaded,
// e.g. to decide per URL whether (and which) proxy is used for this request
if ($this->on_before_download_page)
{
$return = call_user_func($this->on_before_download_page, $url, $link, $this);
if (isset($return)) $link = $return;
}
requests::$input_encoding = null;
$html = $this->request_url($url, $link);
// Record the time spent on slow domains (2018-02-13); only downloads rounding to more than 1 second are counted
$time_run = round(microtime(true) - $page_time_start);
if ($time_run > 1)
{
$this->incr_duration_num($url, $time_run);
}
// Download finished; start time of the page processing
$page_time_start = microtime(true);
if (!$html)
{
return false;
}
// Object describing the page that is currently being processed
$page = array(
'url' => $url,
'raw' => $html,
'request' => array(
'url' => $url,
'method' => $link['method'],
'headers' => $link['headers'],
'params' => $link['params'],
'context_data' => $link['context_data'],
'try_num' => $link['try_num'],
'max_try' => $link['max_try'],
'depth' => $link['depth'],
'taskid' => self::$taskid,
),
);
//printf("memory usage: %.2f M\n", memory_get_usage() / 1024 / 1024 );
unset($html);
//--------------------------------------------------------------------------------
// Callbacks
//--------------------------------------------------------------------------------
// Whether the current page was blocked by anti-spider measures; implemented by the developer
if ($this->is_anti_spider)
{
$is_anti_spider = call_user_func($this->is_anti_spider, $url, $page['raw'], $this);
// The callback detected anti-spider measures and returned true
if ($is_anti_spider)
{
return false;
}
}
// Called after a page has been downloaded, mainly to post-process the page,
// e.g. to add HTML tags to the body of the downloaded page
if ($this->on_download_page)
{
$return = call_user_func($this->on_download_page, $page, $this);
// For those who keep forgetting to return the page
if (isset($return))
{
$page = $return;
}
unset($return);
}
// Whether URLs are extracted from the current page;
// a callback returning false means no further URLs are discovered from this page
$is_find_url = true;
if ($link['url_type'] == 'scan_page')
{
if ($this->on_scan_page)
{
$return = call_user_func($this->on_scan_page, $page, $page['raw'], $this);
if (isset($return))
{
$is_find_url = $return;
}
unset($return);
}
}
elseif ($link['url_type'] == 'content_page')
{
if ($this->on_content_page)
{
$return = call_user_func($this->on_content_page, $page, $page['raw'], $this);
if (isset($return))
{
$is_find_url = $return;
}
unset($return);
}
}
elseif ($link['url_type'] == 'list_page')
{
if ($this->on_list_page)
{
$return = call_user_func($this->on_list_page, $page, $page['raw'], $this);
if (isset($return))
{
$is_find_url = $return;
}
unset($return);
}
}
// on_scan_page / on_list_page / on_content_page returning false means no further URLs are discovered from this page
if ($is_find_url)
{
// Fetch the next level of URLs unless the maximum depth is reached (0 = unlimited)
if (self::$configs['max_depth'] == 0 || $link['depth'] < self::$configs['max_depth'])
{
// Extract the URLs from the HTML page
$this->get_urls($page['raw'], $url, $link['depth'] + 1);
}
}
// Content pages: extract the fields from the HTML page.
// Per the original note, list pages could provide data too (source_type: urlcontext), not implemented
if ($link['url_type'] == 'content_page')
{
$this->get_html_fields($page['raw'], $url, $page);
}
// Per the original note: update the cached depth when the current depth is greater
$this->incr_depth_num($link['depth']);
// Time spent processing the page
$time_run = round(microtime(true) - $page_time_start, 3);
log::debug('task id: '.self::$taskid." Success process page {$url} in {$time_run} s");
$spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
log::info('task id: '.self::$taskid." Spider running in {$spider_time_run}");
// Interval between two pages, in milliseconds
if (!isset(self::$configs['interval']))
{
// Sleep 100 ms by default; crawling too fast is mistaken for a DDoS
self::$configs['interval'] = 100;
}
usleep(self::$configs['interval'] * 1000);
}
/**
 * Download a page and return its content.
 *
 * Configures the requests client (encoding, timeout, user agent, client IP,
 * proxy, headers), issues a GET or POST request, runs the on_status_code
 * callback and follows a single 301/302 redirect. Failed downloads may be put
 * back into the queue depending on max_try and the HTTP status code.
 *
 * The HTTP method is restricted to get/post for the redirect request as well,
 * and optional link fields (proxy, method, try_num, max_try) may be absent.
 *
 * @param mixed $url  URL to download
 * @param mixed $link link description (method, headers, params, proxy, context_data, try_num, max_try)
 * @return mixed the page content, or false when the download failed
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function request_url($url, $link = array())
{
    $time_start = microtime(true);

    // When an input encoding is configured, do not let requests detect it
    if (isset(self::$configs['input_encoding']))
    {
        requests::$input_encoding = self::$configs['input_encoding'];
    }
    // Content that is not utf-8 must be converted, because xpath only supports utf-8
    requests::$output_encoding = 'utf-8';
    requests::set_timeout(self::$configs['timeout']);
    requests::set_useragent(self::$configs['user_agent']);

    // Remove the forged client IP first, then apply the configured one if any
    requests::del_client_ip();
    if ( ! empty(self::$configs['client_ip']))
    {
        requests::set_client_ip(self::$configs['client_ip']);
    }

    // Remove the proxy first so the proxy of the previous URL is not carried over
    requests::del_proxy();
    if ( ! empty($link['proxy']))
    {
        requests::set_proxy($link['proxy']);
    }

    // HTTP headers of this link
    if ( ! empty($link['headers']))
    {
        foreach ($link['headers'] as $k => $v)
        {
            requests::set_header($k, $v);
        }
    }

    // Restrict the HTTP method to get or post; this value is reused for the redirect below
    $method = isset($link['method']) ? trim(strtolower($link['method'])) : 'get';
    $method = ($method == 'post') ? 'post' : 'get';
    $params = empty($link['params']) ? array() : $link['params'];
    $html = requests::$method($url, $params);

    // Data attached to this URL (e.g. list page data needed by the content page) is appended to the content
    if ($html && ! empty($link['context_data']))
    {
        $html .= $link['context_data'];
    }
    $http_code = requests::$status_code;

    // Request finished: decrease the concurrency counter of the host, 2018-5 BY KEN <a-site@foxmail.com>
    if ( ! empty(self::$configs['max_task_per_host']) && self::$configs['max_task_per_host'] > 0)
    {
        $this->incr_task_per_host($url, 'decr');
    }

    if ($this->on_status_code)
    {
        $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
        if (isset($return))
        {
            $html = $return;
        }
        unset($return);
        if ( ! $html)
        {
            return false;
        }
    }

    if ($http_code != 200)
    {
        // 301 / 302: fetch the content of the redirect target
        if ($http_code == 301 || $http_code == 302)
        {
            $info = requests::$info;
            if (empty($info['redirect_url']))
            {
                return false;
            }

            $url = $info['redirect_url'];
            requests::$input_encoding = null;
            // Fetch the target directly instead of recursing, which could loop forever
            $html = requests::$method($url, $params);
            if ($html && ! empty($link['context_data']))
            {
                $html .= $link['context_data'];
            }
        }
        else
        {
            if ( ! empty(self::$configs['max_try']) and $http_code == 407)
            {
                // Put the link back into the queue to collect it again
                $this->queue_rpush($link);
                log::error("Failed to download page {$url}");
                self::$collect_fail++;
            }
            elseif ( ! empty(self::$configs['max_try']) and in_array($http_code, array('0', '502', '503', '429')))
            {
                // One more attempt was made
                $link['try_num'] = isset($link['try_num']) ? $link['try_num'] + 1 : 1;
                $max_try = isset($link['max_try']) ? $link['max_try'] : 0;
                // Retry while the number of attempts does not exceed the allowed failures
                if ($link['try_num'] <= $max_try)
                {
                    // Put the link back into the queue to collect it again
                    $this->queue_rpush($link);
                }
                log::error("Failed to download page {$url}, retry({$link['try_num']})");
            }
            else
            {
                log::error("Failed to download page {$url}");
                self::$collect_fail++;
            }
            log::error("HTTP CODE: {$http_code}");
            return false;
        }
    }

    // Time spent downloading the page
    $time_run = round(microtime(true) - $time_start, 3);
    log::debug("Success download page {$url} in {$time_run} s");
    self::$collect_succ++;
    return $html;
}
/**
 * Extract the links of a page and add them to the crawl queue.
 *
 * Links are read with the xpath //a/@href, filtered by the optional
 * sub-domain limit, resolved against the collecting URL with fill_url(),
 * filtered by the optional per-domain page limit, passed through the
 * on_fetch_url hook and finally handed to add_url().
 *
 * @param mixed $html        HTML content
 * @param mixed $collect_url URL the HTML was collected from, used to build absolute URLs
 * @param int   $depth       depth assigned to the discovered URLs
 * @return false|null false when no usable URL was found, null otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function get_urls($html, $collect_url, $depth = 0)
{
    //--------------------------------------------------------------------------------
    // Collect the raw href values
    //--------------------------------------------------------------------------------
    $urls = selector::select($html, '//a/@href');
    if (empty($urls))
    {
        return false;
    }
    // A single match is not returned as an array; wrap it so the loops below work
    if (!is_array($urls))
    {
        $urls = array($urls);
    }
    $candidates = array();
    foreach ($urls as $url)
    {
        // Limit the number of sub-domains
        if ( ! empty(self::$configs['max_sub_num']))
        {
            $sub_domain_count = $this->sub_domain_count($url);
            if ($sub_domain_count > self::$configs['max_sub_num'])
            {
                log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",get_urls $url [Skip]");
                continue;
            }
        }
        $candidates[] = str_replace(array('"', "'", '&amp;'), array('', '', '&'), $url);
    }
    //--------------------------------------------------------------------------------
    // Filter and complete the URLs
    //--------------------------------------------------------------------------------
    $candidates = array_unique($candidates);
    $limit_pages = isset(self::$configs['max_pages']) && self::$configs['max_pages'] > 0;
    $found = array();
    foreach ($candidates as $url)
    {
        $val = $this->fill_url($url, $collect_url);
        if (!$val)
        {
            continue;
        }
        // Per-domain page limit: a URL over the limit is dropped entirely.
        // It must not stay in the result, otherwise the raw href would
        // still be queued and the limit would have no effect.
        if ($limit_pages && $this->incr_pages_num($val) > self::$configs['max_pages'])
        {
            continue;
        }
        $found[] = $val;
    }
    // Different hrefs can resolve to the same absolute URL
    $found = array_unique($found);
    if (empty($found))
    {
        return false;
    }
    //--------------------------------------------------------------------------------
    // Queue the URLs
    //--------------------------------------------------------------------------------
    foreach ($found as $url)
    {
        if ($this->on_fetch_url)
        {
            $return = call_user_func($this->on_fetch_url, $url, $this);
            $url = isset($return) ? $return : $url;
            unset($return);
            // A falsy result from on_fetch_url keeps the URL out of the queue
            if (!$url)
            {
                continue;
            }
        }
        // The current page is the Referer of every URL found on it
        $options = array(
            'headers' => array(
                'Referer' => $collect_url,
            )
        );
        $this->add_url($url, $options, $depth);
    }
}
/**
 * Turn a link found on a page into a complete http(s) URL.
 *
 * Handles protocol-relative (//host/x), root-relative (/x), query-only (?x),
 * dot-relative (./x, ../x), absolute (http://, https://) and plain relative
 * links. Links with any other scheme (mailto:, tel:, sms:, ...), javascript
 * links, pure fragments and unrendered template tags are rejected.
 *
 * The base directory is derived from the collecting URL by removing a last
 * path segment that contains a dot; a last segment without a dot is treated
 * as a directory.
 *
 * @param mixed $url         link to complete
 * @param mixed $collect_url URL of the page the link was found on
 * @return string|false complete URL, or false when the link is unusable or
 *                      its host is not listed in the configured domains
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function fill_url($url, $collect_url)
{
    $url = trim($url);
    $collect_url = trim($collect_url);
    // Empty links, mailto/javascript links, pure fragments and stray quotes
    if ($url == '' || preg_match("@^(mailto|javascript:|#|'|\")@i", $url))
    {
        return false;
    }
    // Template tags that were not rendered by the server
    if (substr($url, 0, 3) == '<%=' || substr($url, 0, 1) == '{')
    {
        return false;
    }
    // Any explicit scheme other than http/https (tel:, sms:, weixin:, ftp:, ...)
    // must be checked on the link itself, not on the collecting page
    if (preg_match('@^([a-z][a-z0-9+.\-]*):@i', $url, $scheme_match)
        && !in_array(strtolower($scheme_match[1]), array('http', 'https'), true))
    {
        return false;
    }
    $parse_url = @parse_url($collect_url);
    if (empty($parse_url['scheme']) || empty($parse_url['host']))
    {
        return false;
    }
    $scheme = strtolower($parse_url['scheme']);
    if ( ! in_array($scheme, array('http', 'https'), true))
    {
        return false;
    }
    // Keep a non-default port so relative links stay on the same server
    $domain = $parse_url['host'];
    if (!empty($parse_url['port']))
    {
        $domain .= ':'.$parse_url['port'];
    }
    $path = empty($parse_url['path']) ? '' : $parse_url['path'];
    // host + directory of the collecting page, without a trailing slash
    $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/", '/', $domain.$path);
    $base_url_path = preg_replace("/\/$/", '', $base_url_path);
    // Strip the fragment
    $pos = strpos($url, '#');
    if ($pos > 0)
    {
        $url = substr($url, 0, $pos);
    }
    // //www.jd.com/111.html : inherits the scheme of the collecting page
    if (substr($url, 0, 2) == '//')
    {
        $url = substr($url, 2);
    }
    // /1234.html
    elseif ($url[0] == '/')
    {
        $url = $domain.$url;
    }
    // ?page=2 : same document, different query string
    elseif ($url[0] == '?')
    {
        $url = $domain.($path === '' ? '/' : $path).$url;
    }
    // ./1234.html, ../1234.html
    elseif ($url[0] == '.')
    {
        if (!isset($url[2]))
        {
            return false;
        }
        // Only the path part takes part in dot-segment resolution
        $query = '';
        $query_pos = strpos($url, '?');
        if ($query_pos !== false)
        {
            $query = substr($url, $query_pos);
            $url = substr($url, 0, $query_pos);
        }
        // $segments[0] is the host; it can never be popped
        $segments = explode('/', $base_url_path);
        foreach (explode('/', $url) as $segment)
        {
            if ($segment === '..')
            {
                if (count($segments) <= 1)
                {
                    return false;
                }
                array_pop($segments);
            }
            elseif ($segment !== '.')
            {
                $segments[] = $segment;
            }
        }
        $url = implode('/', $segments).$query;
    }
    elseif (strtolower(substr($url, 0, 7)) == 'http://')
    {
        $url = substr($url, 7);
        $scheme = 'http';
    }
    elseif (strtolower(substr($url, 0, 8)) == 'https://')
    {
        $url = substr($url, 8);
        $scheme = 'https';
    }
    // Plain relative path such as 1111.html
    else
    {
        // 'strlen' keeps segments such as "0" that a bare array_filter() would drop
        $segments = array_filter(explode('/', $base_url_path), 'strlen');
        $url = implode('/', $segments).'/'.$url;
    }
    // Collapse repeated slashes, but only in the path: a query string may
    // legitimately contain "//" (e.g. ?redirect=http://...)
    $query_pos = strpos($url, '?');
    if ($query_pos === false)
    {
        $url = preg_replace('/\/{2,}/', '/', $url);
    }
    else
    {
        $url = preg_replace('/\/{2,}/', '/', substr($url, 0, $query_pos)).substr($url, $query_pos);
    }
    $url = $scheme.'://'.$url;
    $parse_url = @parse_url($url);
    // Without a host the result cannot be requested
    if (empty($parse_url['host']))
    {
        return false;
    }
    // No domain list, or the wildcard '*': every host is accepted
    if (empty(self::$configs['domains']) or (isset(self::$configs['domains'][0]) && self::$configs['domains'][0] == '*'))
    {
        return $url;
    }
    // Skip URLs outside the configured domains to speed up crawling
    if (!in_array($parse_url['host'], self::$configs['domains']))
    {
        return false;
    }
    return $url;
}
/**
 * Compress a link descriptor by removing every entry that only holds an
 * empty value; a "get" method (any letter case) is removed as well.
 *
 * @param array $link link descriptor
 * @return array the descriptor without its empty entries
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-05 18:58
 */
public function link_compress($link)
{
    // Entries that are dropped as soon as they are empty
    $droppable = array('url_type', 'headers', 'params', 'context_data', 'proxy', 'try_num', 'max_try', 'depth');
    foreach ($droppable as $field)
    {
        if (empty($link[$field]))
        {
            unset($link[$field]);
        }
    }
    // The method is also dropped when it is a plain get
    $is_get = empty($link['method']) || strtolower($link['method']) == 'get';
    if ($is_get)
    {
        unset($link['method']);
    }
    return $link;
}
/**
 * Expand a link descriptor: every known entry that is not set gets its
 * default value. proxy and max_try default to the spider configuration.
 *
 * @param mixed $link possibly compressed link descriptor
 * @return array descriptor holding exactly the keys url, url_type, method,
 *               headers, params, context_data, proxy, try_num, max_try, depth
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-05 18:58
 */
public function link_uncompress($link)
{
    // null marks the entries whose default is read from the configuration
    $defaults = array(
        'url'          => '',
        'url_type'     => '',
        'method'       => 'get',
        'headers'      => array(),
        'params'       => array(),
        'context_data' => '',
        'proxy'        => null,
        'try_num'      => 0,
        'max_try'      => null,
        'depth'        => 0,
    );
    $expanded = array();
    foreach ($defaults as $field => $fallback)
    {
        if (isset($link[$field]))
        {
            $expanded[$field] = $link[$field];
        }
        elseif ($field === 'proxy' || $field === 'max_try')
        {
            // Read the configuration only when the link does not carry the value
            $expanded[$field] = self::$configs[$field];
        }
        else
        {
            $expanded[$field] = $fallback;
        }
    }
    return $expanded;
}
/**
 * Extract the configured fields from a page, log them and export them.
 *
 * The on_extract_page hook may replace the extracted fields, or return
 * false to skip the page. The process exits once the configured
 * max_fields limit is exceeded.
 *
 * @param mixed $html page HTML
 * @param mixed $url  page URL
 * @param mixed $page page descriptor handed to the hooks
 * @return false|null false when on_extract_page rejected the page, null otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function get_html_fields($html, $url, $page)
{
    $fields = $this->get_fields(self::$configs['fields'], $html, $url, $page);
    if (empty($fields))
    {
        return;
    }
    if ($this->on_extract_page)
    {
        $hook_result = call_user_func($this->on_extract_page, $page, $fields);
        // false: skip this page, nothing is stored
        if ($hook_result === false)
        {
            return false;
        }
        if (!isset($hook_result))
        {
            log::warn("on_extract_page return value can't be empty");
        }
        elseif (!is_array($hook_result))
        {
            log::warn('on_extract_page return value must be an array');
        }
        else
        {
            $fields = $hook_result;
        }
    }
    if (!is_array($fields))
    {
        return;
    }
    $fields_num = $this->incr_fields_num();
    $over_limit = self::$configs['max_fields'] != 0 && $fields_num > self::$configs['max_fields'];
    if ($over_limit)
    {
        exit(0);
    }
    // JSON_UNESCAPED_UNICODE needs PHP 5.4; older versions unescape \uXXXX by hand
    $legacy_php = version_compare(PHP_VERSION,'5.4.0','<');
    if ($legacy_php)
    {
        $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function ($matchs)
        {
            return @iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1]));
        }, json_encode($fields));
    }
    else
    {
        $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE);
    }
    if (util::is_win())
    {
        $fields_str = mb_convert_encoding($fields_str, 'gb2312', 'utf-8');
    }
    log::info("Result[{$fields_num}]: ".$fields_str);
    // Nothing more to do without an export configuration
    if (empty(self::$configs['export']))
    {
        return;
    }
    self::$export_type = isset(self::$configs['export']['type']) ? self::$configs['export']['type'] : '';
    switch (self::$export_type)
    {
        case 'csv':
            util::put_file(self::$export_file, util::format_csv($fields)."\n", FILE_APPEND);
            break;
        case 'sql':
            $sql = db::insert(self::$export_table, $fields, true);
            util::put_file(self::$export_file, $sql.";\n", FILE_APPEND);
            break;
        case 'db':
            db::insert(self::$export_table, $fields);
            break;
    }
}
/**
 * Extract fields from an HTML fragment according to a list of field
 * configurations.
 *
 * Each configuration may name a selector (xpath by default, css or regex),
 * an attached URL taken from a previously extracted field, and child
 * configurations that are applied recursively to every extracted fragment.
 * After extraction the on_handle_img and on_extract_field hooks are run
 * for every field.
 *
 * @param mixed $confs field configurations
 * @param mixed $html  HTML the selectors are applied to
 * @param mixed $url   URL of the page, used to complete attached URLs
 * @param mixed $page  page descriptor handed to on_extract_field
 * @return array extracted fields keyed by field name; empty when a required
 *               field was not found
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function get_fields($confs, $html, $url, $page)
{
    $fields = array();
    foreach ($confs as $conf)
    {
        // Whether the field may hold several values
        $repeated = !empty($conf['repeated']);
        // Whether the field must have a value
        $required = !empty($conf['required']);
        if (empty($conf['name']))
        {
            log::error("The field name is null, please check your \"fields\" and add the name of the field\n");
            exit;
        }
        $values = NULL;
        if (!empty($conf['selector']))
        {
            // This field is read from a page whose URL is stored in an earlier field
            if (isset($conf['source_type']) && $conf['source_type']=='attached_url')
            {
                if (!empty($conf['attached_url']) && !empty($fields[$conf['attached_url']]))
                {
                    $collect_url = $this->fill_url($fields[$conf['attached_url']], $url);
                    if ($collect_url)
                    {
                        log::debug("Find attached content page: {$collect_url}");
                        // Attached pages are downloaded directly, without going through the queue.
                        // A fresh descriptor is built every time so nothing leaks from a previous field.
                        $attached_link = $this->link_uncompress(array('url' => $collect_url));
                        requests::$input_encoding = null;
                        $html = $this->request_url($collect_url, $attached_link);
                    }
                    else
                    {
                        // The stored link cannot be completed: treat it like a failed download
                        log::warn("Attached url of field {$conf['name']} can not be resolved");
                        $html = false;
                    }
                    // Called after an attached page was downloaded, to post-process it
                    if ($this->on_download_attached_page)
                    {
                        $return = call_user_func($this->on_download_attached_page, $html, $this);
                        if (isset($return))
                        {
                            $html = $return;
                        }
                    }
                    // The link itself is no longer needed once the page was requested
                    unset($fields[$conf['attached_url']]);
                }
            }
            // No selector type, or xpath
            if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath')
            {
                $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']);
            }
            elseif ($conf['selector_type']=='css')
            {
                $values = $this->get_fields_css($html, $conf['selector'], $conf['name']);
            }
            elseif ($conf['selector_type']=='regex')
            {
                $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']);
            }
            // The field has a value and child configurations
            if (isset($values) && !empty($conf['children']))
            {
                // A single string is wrapped so both cases share the loop below
                if (!is_array($values))
                {
                    $values = array($values);
                }
                $child_values = array();
                // Every extracted fragment is the input of the child selectors
                foreach ($values as $child_html)
                {
                    $child_value = $this->get_fields($conf['children'], $child_html, $url, $page);
                    if (!empty($child_value))
                    {
                        $child_values[] = $child_value;
                    }
                }
                // Store the children when there are any, otherwise keep the fragments
                if (!empty($child_values))
                {
                    $values = $child_values;
                }
            }
        }
        if (!isset($values))
        {
            // A missing required field discards the whole page
            if ($required)
            {
                log::warn("Selector {$conf['name']}[{$conf['selector']}] not found, It's a must");
                $fields = array();
                break;
            }
            // An empty string avoids string + array when attached pages are concatenated
            $fields[$conf['name']] = '';
        }
        elseif (is_array($values) && !$repeated)
        {
            // Not repeated: only the first element is kept
            $fields[$conf['name']] = isset($values[0]) ? $values[0] : '';
        }
        else
        {
            $fields[$conf['name']] = $values;
        }
    }
    if (!empty($fields))
    {
        $pattern = "/<img\s+.*?src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isu";
        foreach ($fields as $fieldname => $data)
        {
            // Fields containing img tags are handed to on_handle_img.
            // Only strings can be matched; repeated fields hold arrays.
            if ($this->on_handle_img && is_string($data) && preg_match($pattern, $data))
            {
                $return = call_user_func($this->on_handle_img, $fieldname, $data);
                if (!isset($return))
                {
                    log::warn("on_handle_img return value can't be empty\n");
                }
                else
                {
                    // Store the result right away so it is kept even when
                    // no on_extract_field hook is registered
                    $data = $return;
                    $fields[$fieldname] = $data;
                }
            }
            // Called for every extracted field to post-process its content
            if ($this->on_extract_field)
            {
                $return = call_user_func($this->on_extract_field, $fieldname, $data, $page);
                if (!isset($return))
                {
                    log::warn("on_extract_field return value can't be empty\n");
                }
                else
                {
                    $fields[$fieldname] = $return;
                }
            }
        }
    }
    return $fields;
}
/**
 * Validate the export settings before the crawl starts.
 *
 * csv and sql exports need a target file. A db export needs the mysqli
 * extension, a connection configuration, a working connection and an
 * existing target table. Every failed check is logged and ends the process.
 *
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-02 23:37
 */
public function check_export()
{
    // Nothing to check without an export configuration
    if (empty(self::$configs['export']))
    {
        return;
    }
    if (self::$export_type == 'csv')
    {
        if (empty(self::$export_file))
        {
            log::error('Export data into CSV files need to Set the file path.');
            exit;
        }
    }
    elseif (self::$export_type == 'sql')
    {
        if (empty(self::$export_file))
        {
            log::error('Export data into SQL files need to Set the file path.');
            exit;
        }
    }
    elseif (self::$export_type == 'db')
    {
        if (!function_exists('mysqli_connect'))
        {
            log::error('Export data to a database need Mysql support, unable to load mysqli extension.');
            exit;
        }
        if (empty(self::$db_config))
        {
            log::error('Export data to a database need Mysql support, you have not set a config array for connect.');
            exit;
        }
        $config = self::$db_config;
        // Probe connection. Since PHP 8.1 mysqli reports connection failures
        // as exceptions by default, so both failure styles are handled.
        $connect_error = null;
        try
        {
            $probe = @mysqli_connect(
                $config['host'],
                $config['user'],
                $config['pass'],
                $config['name'],
                isset($config['port']) ? $config['port'] : null
            );
            if ($probe)
            {
                mysqli_close($probe);
            }
            else
            {
                $connect_error = (string) mysqli_connect_error();
            }
        }
        catch (Exception $e)
        {
            $connect_error = $e->getMessage();
        }
        if ($connect_error !== null)
        {
            log::error('Export data to a database need Mysql support, '.$connect_error);
            exit;
        }
        db::set_connect('default', $config);
        db::_init();
        if (!db::table_exists(self::$export_table))
        {
            log::error('Table '.self::$export_table.' does not exist');
            exit;
        }
    }
}
/**
 * Ask on the console whether data left in Redis should be kept.
 *
 * Does nothing unless Redis is in use and save_running_state is off.
 * Only the answer "n" (any letter case) flushes the whole Redis database,
 * so that database must not be shared with other projects.
 *
 * @return false|null false when the check is not applicable, null otherwise
 */
public function check_cache()
{
    if (self::$save_running_state || !self::$use_redis)
    {
        return false;
    }
    // The whole database is inspected (and flushed), not only the spider's own keys
    $count = queue::dbsize();
    if (!($count > 0))
    {
        return;
    }
    $prompt  = "Found that the data of Redis, no continue will empty Redis data start again\n";
    $prompt .= 'Do you want to continue? [Y/n]';
    fwrite(STDOUT, $prompt);
    // Anything other than "n" (empty input included) means: continue
    $answer = strtolower(trim(fgets(STDIN)));
    if ($answer === 'n')
    {
        log::warn('Clear redis data...');
        queue::flushdb();
    }
}
/**
 * Register this server in Redis and remove the status entries of all of
 * its tasks, which matters after the processes were killed.
 *
 * @return false|null false when Redis is not in use, null otherwise
 */
public function init_redis()
{
    if (!self::$use_redis)
    {
        return false;
    }
    // Add the current server to the server list
    $this->add_server_list(self::$serverid, self::$tasknum);
    // Task ids run from 1 to tasknum
    $taskid = 1;
    while ($taskid <= self::$tasknum)
    {
        $this->del_task_status(self::$serverid, $taskid);
        $taskid++;
    }
}
/**
 * Publish the status of the current task: id, pid, memory in MB, number
 * of successful and failed downloads and pages per second.
 *
 * The status is JSON encoded and stored in Redis under
 * server-<serverid>-task_status-<taskid>, or in self::$task_status when
 * Redis is not in use.
 *
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-30 23:56
 */
public function set_task_status()
{
    $memory_mb = round(memory_get_usage(true)/(1024*1024),2);
    $elapsed   = microtime(true) - self::$time_start;
    $processed = self::$collect_succ + self::$collect_fail;
    $task_status = json_encode(array(
        'id'           => self::$taskid,
        'pid'          => self::$taskpid,
        'mem'          => $memory_mb,
        'collect_succ' => self::$collect_succ,
        'collect_fail' => self::$collect_fail,
        'speed'        => round($processed / $elapsed, 2),
    ));
    if (!self::$use_redis)
    {
        self::$task_status = array($task_status);
        return;
    }
    queue::set('server-'.self::$serverid.'-task_status-'.self::$taskid, $task_status);
}
/**
 * Remove the stored status of one task from Redis.
 *
 * @param mixed $serverid server id
 * @param mixed $taskid   task id
 * @return false|null false when Redis is not in use, null otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function del_task_status($serverid, $taskid)
{
    if (self::$use_redis)
    {
        queue::del('server-'.$serverid.'-task_status-'.$taskid);
        return;
    }
    return false;
}
/**
 * Read the stored status of one task from Redis.
 *
 * @param mixed $serverid server id
 * @param mixed $taskid   task id
 * @return mixed whatever queue::get() returns for the status key, or false
 *               when Redis is not in use
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-30 23:56
 */
public function get_task_status($serverid, $taskid)
{
    if (self::$use_redis)
    {
        return queue::get('server-'.$serverid.'-task_status-'.$taskid);
    }
    return false;
}
/**
 * Collect the stored status of the tasks 1..$tasknum of a server.
 *
 * With Redis one entry per task is read; without Redis the locally kept
 * self::$task_status is returned.
 *
 * $tasknum has a default so that the signature no longer declares an
 * optional parameter before a required one (deprecated since PHP 8.0);
 * existing calls that pass both arguments are unaffected.
 *
 * @param mixed $serverid server id
 * @param mixed $tasknum  number of tasks of that server
 * @return array list of task status values as stored
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-10-30 23:56
 */
public function get_task_status_list($serverid = 1, $tasknum = 0)
{
    if (!self::$use_redis)
    {
        return self::$task_status;
    }
    $task_status = array();
    for ($i = 1; $i <= $tasknum; $i++)
    {
        $task_status[] = queue::get("server-{$serverid}-task_status-".$i);
    }
    return $task_status;
}
/**
 * Add (or refresh) a server in the JSON encoded server list kept in Redis
 * under the key server_list. The list is keyed and sorted by server id.
 *
 * @param mixed $serverid server id
 * @param mixed $tasknum  number of tasks of that server
 * @return false|null false when Redis is not in use, null otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function add_server_list($serverid, $tasknum)
{
    if (!self::$use_redis)
    {
        return false;
    }
    // Start from the stored list when there is one
    $stored = queue::get('server_list');
    $server_list = $stored ? json_decode($stored, true) : array();
    $server_list[$serverid] = array(
        'serverid' => $serverid,
        'tasknum'  => $tasknum,
        'time'     => time(),
    );
    ksort($server_list);
    queue::set('server_list', json_encode($server_list));
}
/**
 * Remove a server from the JSON encoded server list kept in Redis under
 * the key server_list.
 *
 * When the removed server was the last one the key itself is deleted;
 * otherwise the remaining list is written back sorted by server id.
 *
 * @param mixed $serverid server id
 * @return false|null false when Redis is not in use, null otherwise
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function del_server_list($serverid)
{
    if (!self::$use_redis)
    {
        return false;
    }
    $server_list_json = queue::get('server_list');
    if (!$server_list_json)
    {
        return;
    }
    $server_list = json_decode($server_list_json, true);
    // Leave a value that is not a list untouched
    if (!is_array($server_list))
    {
        return;
    }
    unset($server_list[$serverid]);
    if (empty($server_list))
    {
        // Nothing left: without this the removed server would stay listed forever
        queue::del('server_list');
        return;
    }
    ksort($server_list);
    queue::set('server_list', json_encode($server_list));
}
/**
 * Number of URLs that were added to the crawl queue so far.
 *
 * @return mixed the collect_urls_num counter from Redis, or the local
 *               counter when Redis is not in use
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function get_collect_url_num()
{
    return self::$use_redis ? queue::get('collect_urls_num') : self::$collect_urls_num;
}
/**
 * Number of URLs that were already crawled.
 *
 * @return mixed the collected_urls_num counter from Redis, or the local
 *               counter when Redis is not in use
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function get_collected_url_num()
{
    return self::$use_redis ? queue::get('collected_urls_num') : self::$collected_urls_num;
}
/**
 * Increase the number of crawled URLs by one.
 *
 * @param mixed $url crawled URL (not used by the counter)
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function incr_collected_url_num($url)
{
    if (!self::$use_redis)
    {
        self::$collected_urls_num++;
        return;
    }
    queue::incr('collected_urls_num');
}
/**
 * Push a link onto the left end of the crawl queue.
 *
 * A URL is queued only once unless $allowed_repeat is true. With Redis
 * the check and the push happen under a per-URL lock, and the queue is a
 * set when queue_order is "rand" and a list otherwise. Without Redis the
 * link is appended to the in-memory queue.
 *
 * @param array $link           link descriptor, must contain a non-empty url
 * @param bool  $allowed_repeat queue the URL even when it was queued before
 * @return bool true when the link was queued
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function queue_lpush($link = array(), $allowed_repeat = false)
{
    if (empty($link) || empty($link['url']))
    {
        return false;
    }
    $url = $link['url'];
    $link = $this->link_compress($link);
    $status = false;
    if (self::$use_redis)
    {
        $key = 'collect_urls-'.md5($url);
        $lock = 'lock-'.$key;
        // Lock: the processes handle the URL one after another
        if (queue::lock($lock))
        {
            $exists = queue::exists($key);
            // Unknown URL, or repeats are allowed
            if (!$exists || $allowed_repeat)
            {
                // One more page is waiting to be crawled
                queue::incr('collect_urls_num');
                // Mark the URL as queued
                queue::set($key, time());
                $link = json_encode($link);
                // A set for random order, a list for sequential order
                if (self::$configs['queue_order'] == 'rand')
                {
                    queue::sadd('collect_queue', $link);
                }
                else
                {
                    queue::lpush('collect_queue', $link);
                }
                $status = true;
            }
            queue::unlock($lock);
        }
    }
    else
    {
        $key = md5($url);
        // Same rule as the Redis branch: unknown URL, or repeats are allowed
        if ($allowed_repeat || !array_key_exists($key, self::$collect_urls))
        {
            self::$collect_urls_num++;
            self::$collect_urls[$key] = time();
            array_push(self::$collect_queue, $link);
            $status = true;
        }
    }
    return $status;
}
/**
 * Push a link onto the right end of the crawl queue.
 *
 * A URL is queued only once unless $allowed_repeat is true. With Redis
 * the check and the push happen under a per-URL lock, and the queue is a
 * set when queue_order is "rand" and a list otherwise. Without Redis the
 * link is prepended to the in-memory queue.
 *
 * @param array $link           link descriptor, must contain a non-empty url
 * @param bool  $allowed_repeat queue the URL even when it was queued before
 * @return bool true when the link was queued
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function queue_rpush($link = array(), $allowed_repeat = false)
{
    if (empty($link) || empty($link['url']))
    {
        return false;
    }
    $url = $link['url'];
    $status = false;
    if (self::$use_redis)
    {
        $key = 'collect_urls-'.md5($url);
        $lock = 'lock-'.$key;
        // Lock: the processes handle the URL one after another
        if (queue::lock($lock))
        {
            $exists = queue::exists($key);
            // Unknown URL, or repeats are allowed
            if ( ! $exists || $allowed_repeat)
            {
                // One more page is waiting to be crawled
                queue::incr('collect_urls_num');
                // Mark the URL as queued
                queue::set($key, time());
                $link = json_encode($link);
                // A set for random order, a list for sequential order
                if (self::$configs['queue_order'] == 'rand')
                {
                    queue::sadd('collect_queue', $link);
                }
                else
                {
                    queue::rpush('collect_queue', $link);
                }
                $status = true;
            }
            queue::unlock($lock);
        }
    }
    else
    {
        $key = md5($url);
        // Same rule as the Redis branch: unknown URL, or repeats are allowed
        if ($allowed_repeat || !array_key_exists($key, self::$collect_urls))
        {
            self::$collect_urls_num++;
            self::$collect_urls[$key] = time();
            array_unshift(self::$collect_queue, $link);
            $status = true;
        }
    }
    return $status;
}
/**
 * Take a link from the left end of the crawl queue (last in, first out).
 *
 * This order keeps the pages of a paginated content page together and
 * lets list pages be crawled in order. With Redis the entry is taken from
 * the set (queue_order "rand") or the list and JSON decoded; without
 * Redis the last element of the in-memory queue is taken.
 *
 * @return mixed the link descriptor, or the empty result of the underlying
 *               pop when the queue is empty
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function queue_lpop()
{
    if (!self::$use_redis)
    {
        return array_pop(self::$collect_queue);
    }
    // A set for random order, a list for sequential order
    $encoded = (self::$configs['queue_order'] == 'rand')
        ? queue::spop('collect_queue')
        : queue::lpop('collect_queue');
    return json_decode($encoded, true);
}
/**
 * Take a link from the right end of the crawl queue.
 *
 * With Redis the entry is taken from the set (queue_order "rand") or the
 * list and JSON decoded; without Redis the first element of the in-memory
 * queue is taken.
 *
 * @return mixed the link descriptor, or the empty result of the underlying
 *               pop when the queue is empty
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function queue_rpop()
{
    if (!self::$use_redis)
    {
        return array_shift(self::$collect_queue);
    }
    // A set for random order, a list for sequential order
    $encoded = (self::$configs['queue_order'] == 'rand')
        ? queue::spop('collect_queue')
        : queue::rpop('collect_queue');
    return json_decode($encoded, true);
}
/**
 * Length of the crawl queue.
 *
 * @return mixed size of the Redis set (queue_order "rand") or list, or the
 *               element count of the in-memory queue
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function queue_lsize()
{
    if (!self::$use_redis)
    {
        return count(self::$collect_queue);
    }
    // A set for random order, a list for sequential order
    if (self::$configs['queue_order'] == 'rand')
    {
        return queue::scard('collect_queue');
    }
    return queue::lsize('collect_queue');
}
/**
 * Record the deepest crawl depth seen so far: the stored depth is raised
 * to $depth when $depth is larger.
 *
 * With Redis the update runs under the lock lock-depth_num and is skipped
 * when the lock cannot be taken.
 *
 * @param mixed $depth depth of the page that was just processed
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function incr_depth_num($depth)
{
    if (!self::$use_redis)
    {
        if (self::$depth_num < $depth)
        {
            self::$depth_num = $depth;
        }
        return;
    }
    $lock = 'lock-depth_num';
    // The lock is requested with a value of 2 for its third argument
    if (!queue::lock($lock, time(), 2))
    {
        return;
    }
    if (queue::get('depth_num') < $depth)
    {
        queue::set('depth_num', $depth);
    }
    queue::unlock($lock);
}
/**
 * Deepest crawl depth recorded so far.
 *
 * @return mixed the stored depth; 0 when Redis holds no usable value
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function get_depth_num()
{
    if (!self::$use_redis)
    {
        return self::$depth_num;
    }
    $stored = queue::get('depth_num');
    if ($stored)
    {
        return $stored;
    }
    return 0;
}
/**
 * Increase the number of extracted results by one.
 *
 * @return mixed the counter after the increment
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function incr_fields_num()
{
    if (self::$use_redis)
    {
        return queue::incr('fields_num');
    }
    return ++self::$fields_num;
}
/**
 * Number of extracted results.
 *
 * @return mixed the counter from Redis or the local counter; 0 when the
 *               value is empty
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-23 17:13
 */
public function get_fields_num()
{
    $stored = self::$use_redis ? queue::get('fields_num') : self::$fields_num;
    if ($stored)
    {
        return $stored;
    }
    return 0;
}
/**
 * Increase the page counter of the domain a URL belongs to; used to cap
 * the number of pages collected per domain.
 *
 * The domain comes from getRootDomain($url, 'host'); the bucket "all" is
 * used when no URL is given or no domain can be determined.
 *
 * @param mixed $url URL whose domain counter is increased
 * @return mixed the counter after the increment
 * @author KEN <a-site@foxmail.com>
 * @created time :2018-05
 */
public function incr_pages_num($url = '')
{
    $domain = empty($url) ? '' : $this->getRootDomain($url, 'host');
    if (empty($domain))
    {
        $domain = 'all';
    }
    if (self::$use_redis)
    {
        return queue::incr('pages_num:'.$domain);
    }
    // First page of this domain starts the counter at 1
    if (empty(self::$pages_num[$domain]))
    {
        self::$pages_num[$domain] = 1;
    }
    else
    {
        self::$pages_num[$domain]++;
    }
    return self::$pages_num[$domain];
}
/**
 * Add a download time to the slow-request total of the domain a URL
 * belongs to; used to cap the total collecting time per domain.
 *
 * The domain comes from getRootDomain($url); the bucket "all" is used
 * when no URL is given or no domain can be determined.
 *
 * @param mixed $url      URL whose domain total is increased
 * @param mixed $time_run amount to add
 * @return mixed the total after the addition
 * @author KEN <a-site@foxmail.com>
 * @created time :2018-05
 */
public function incr_duration_num($url = '', $time_run = 1)
{
    $domain = empty($url) ? '' : $this->getRootDomain($url);
    if (empty($domain))
    {
        $domain = 'all';
    }
    if (self::$use_redis)
    {
        return queue::incr('duration:'.$domain, $time_run);
    }
    // The first value of a domain is stored as given, later ones are added
    if (empty(self::$duration[$domain]))
    {
        self::$duration[$domain] = $time_run;
    }
    else
    {
        self::$duration[$domain] += $time_run;
    }
    return self::$duration[$domain];
}
/**
 * Slow-request total recorded for the domain a URL belongs to.
 *
 * The domain comes from getRootDomain($url); the bucket "all" is used
 * when no URL is given or no domain can be determined.
 *
 * @param mixed $url URL whose domain total is read
 * @return mixed the stored total, 0 when nothing was recorded
 * @author KEN <a-site@foxmail.com>
 * @created time :2018-04
 */
public function get_duration_num($url = '')
{
    $domain = empty($url) ? '' : $this->getRootDomain($url);
    if (empty($domain))
    {
        $domain = 'all';
    }
    if (self::$use_redis)
    {
        $total = queue::get('duration:'.$domain);
    }
    elseif (empty(self::$duration[$domain]))
    {
        $total = 0;
    }
    else
    {
        $total = self::$duration[$domain];
    }
    return $total ? $total : 0;
}
/**
 * Increase or decrease the number of requests currently running against
 * the host of a URL.
 *
 * The host comes from getRootDomain($url, 'host'). Without Redis the
 * counter never drops below 0, and a decrement of a counter that was
 * never increased leaves it at 0.
 *
 * @param mixed  $url  URL whose host counter is changed
 * @param string $type 'decr' decreases the counter, anything else increases it
 * @return mixed the counter after the change, or false when the URL is empty
 *               or no host can be determined
 * @author KEN <a-site@foxmail.com>
 * @created time :2018-05-28 16:40
 */
public function incr_task_per_host($url = '', $type = 'incr')
{
    if (empty($url))
    {
        return false;
    }
    $domain = $this->getRootDomain($url, 'host');
    if (empty($domain))
    {
        return false;
    }
    $decrease = ($type == 'decr');
    if (self::$use_redis)
    {
        $key = 'task_per_host:'.$domain;
        return $decrease ? queue::decr($key) : queue::incr($key);
    }
    $current = empty(self::$task_per_host_counter[$domain]) ? 0 : self::$task_per_host_counter[$domain];
    // A decrement must never turn an idle host (0) into a busy one
    $current = $decrease ? max(0, $current - 1) : $current + 1;
    self::$task_per_host_counter[$domain] = $current;
    return $current;
}
/**
 * Number of requests currently running against the host of a URL.
 *
 * The host comes from getRootDomain($url, 'host').
 *
 * @param mixed $url URL whose host counter is read
 * @return mixed the stored counter; 0 when the URL is empty, no host can be
 *               determined or nothing was recorded for the host
 * @author KEN <a-site@foxmail.com>
 */
public function get_task_per_host_num($url)
{
    if (empty($url))
    {
        return 0;
    }
    $domain = $this->getRootDomain($url, 'host');
    if (empty($domain))
    {
        return 0;
    }
    if (self::$use_redis)
    {
        $count = queue::get('task_per_host:'.$domain);
    }
    else
    {
        // A host that was never counted has no entry yet
        $count = isset(self::$task_per_host_counter[$domain]) ? self::$task_per_host_counter[$domain] : 0;
    }
    return $count ? $count : 0;
}
/**
 * Extract a field with an xpath selector.
 *
 * An error reported by the selector is logged together with the field name.
 *
 * @param mixed $html      HTML to search
 * @param mixed $selector  xpath expression
 * @param mixed $fieldname field name, used in the error message
 * @return mixed whatever selector::select() returns
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function get_fields_xpath($html, $selector, $fieldname)
{
    $matched = selector::select($html, $selector);
    if (selector::$error)
    {
        log::error('Field("'.$fieldname.'") '.selector::$error."\n");
    }
    return $matched;
}
/**
 * Extract a field with a regular expression.
 *
 * An error reported by the selector is logged together with the field name.
 *
 * @param mixed $html      HTML to search
 * @param mixed $selector  regular expression
 * @param mixed $fieldname field name, used in the error message
 * @return mixed whatever selector::select() returns
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function get_fields_regex($html, $selector, $fieldname)
{
    $matched = selector::select($html, $selector, 'regex');
    if (selector::$error)
    {
        log::error('Field("'.$fieldname.'") '.selector::$error."\n");
    }
    return $matched;
}
/**
 * Extract a field with a CSS selector.
 *
 * An error reported by the selector is logged together with the field name.
 *
 * @param mixed $html      HTML to search
 * @param mixed $selector  CSS selector
 * @param mixed $fieldname field name, used in the error message
 * @return mixed whatever selector::select() returns
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-09-18 10:17
 */
public function get_fields_css($html, $selector, $fieldname)
{
    $matched = selector::select($html, $selector, 'css');
    if (selector::$error)
    {
        log::error('Field("'.$fieldname.'") '.selector::$error."\n");
    }
    return $matched;
}
/**
 * Clear the terminal by writing the escape sequences ESC[H and ESC[2J.
 *
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function clear_echo()
{
    // Bytes 27 91 72 27 91 50 74
    echo "\033[H\033[2J";
}
/**
 * Print a message in place of the previously printed one.
 *
 * The number of terminal lines the last message occupied is remembered
 * between calls; that many lines are erased before the new message is
 * written. The terminal width comes from "tput cols", with 64 as fallback.
 *
 * @param mixed $message           text to print; a newline is appended
 * @param mixed $force_clear_lines when not null, the number of lines to
 *                                 erase instead of the remembered value
 * @return void
 * @author seatle <seatle@foxmail.com>
 * @created time :2016-11-16 11:06
 */
public function replace_echo($message, $force_clear_lines = NULL)
{
    static $last_lines = 0;
    if (!is_null($force_clear_lines))
    {
        $last_lines = $force_clear_lines;
    }
    // Terminal width
    $toss = $status = null;
    $term_width = (int) exec('tput cols', $toss, $status);
    if ($status || $term_width <= 0)
    {
        $term_width = 64; // Arbitrary fall-back term width.
    }
    // Lines the message occupies once the terminal wraps it. An empty line
    // still occupies one line; counting by length keeps that true on every
    // PHP version (str_split('') is empty from PHP 8.2 on).
    $line_count = 0;
    foreach (explode("\n", $message) as $line)
    {
        $line_count += max(1, (int) ceil(strlen($line) / $term_width));
    }
    // Erase as many lines as the last output had: per line, go to the start
    // of the line, erase it, move up one line and erase that one as well
    for ($i = 0; $i < $last_lines; $i++)
    {
        echo "\r\033[K\033[1A\r\033[K\r";
    }
    $last_lines = $line_count;
    echo $message."\n";
}
/**
 * Render the status screen: version, start time, spider name, load
 * average, then the task table, the server table (multi-server mode only)
 * and the collect summary. The screen replaces the previous one via
 * replace_echo(). Not reached on Windows.
 *
 * @return void
 */
public function display_ui()
{
    // sys_getloadavg() returns false when the load cannot be read
    $loadavg = sys_getloadavg();
    if (!is_array($loadavg))
    {
        $loadavg = array();
    }
    foreach ($loadavg as $k=>$v)
    {
        $loadavg[$k] = round($v, 2);
    }
    $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m";
    $run_time_str = util::time2second(time() - self::$time_start, false);
    $display_str .= 'PHPSpider version:'.self::VERSION.' PHP version:'.PHP_VERSION."\n";
    // date() expects an integer timestamp; the start time is also used in
    // microtime(true) arithmetic elsewhere, so it may be a float
    $display_str .= 'start time:'.date('Y-m-d H:i:s', (int) self::$time_start).' run '.$run_time_str." \n";
    $display_str .= 'spider name: '.self::$configs['name']."\n";
    if (self::$multiserver)
    {
        $display_str .= 'server id: '.self::$serverid."\n";
    }
    $display_str .= 'task number: '.self::$tasknum."\n";
    $display_str .= 'load average: '.implode(', ', $loadavg)."\n";
    $display_str .= "document: https://doc.phpspider.org\n";
    $display_str .= $this->display_task_ui();
    if (self::$multiserver)
    {
        $display_str .= $this->display_server_ui();
    }
    $display_str .= $this->display_collect_ui();
    $display_str .= "---------------------------------------------------------------------\n";
    $display_str .= 'Press Ctrl-C to quit. Start success.'.date('Y-m-d H:i:s').' - '.round(memory_get_usage() / 1024 / 1024, 2).'MB'."\n";
    if (self::$terminate)
    {
        $display_str .= "\n\033[33mWait for the process exits...\033[0m";
    }
    $this->replace_echo($display_str);
}
/**
 * Build the "TASKS" section of the status dashboard.
 *
 * Emits a title bar, a highlighted header row and one row per task status
 * entry returned by get_task_status_list() for this server. Entries that
 * decode to an empty value are skipped.
 *
 * @return string the rendered section, terminated by a newline
 */
public function display_task_ui()
{
    // Printed width of every column, keyed by its header label, in display order.
    $widths = array(
        'taskid'       => self::$taskid_length + 2,
        'taskpid'      => self::$pid_length + 2,
        'mem'          => self::$mem_length + 2,
        'collect succ' => self::$urls_length,
        'collect fail' => self::$urls_length,
        'speed'        => self::$speed_length + 2,
    );

    $out = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n";

    // Header: highlighted label followed by blanks up to the column width.
    foreach ($widths as $label => $width)
    {
        $out .= "\033[47;30m".$label."\033[0m".str_pad('', $width - strlen($label));
    }
    $out .= "\n";

    $status_list = $this->get_task_status_list(self::$serverid, self::$tasknum);
    foreach ($status_list as $encoded)
    {
        $task = json_decode($encoded, true);
        if ( ! empty($task))
        {
            $out .= str_pad($task['id'], $widths['taskid']);
            $out .= str_pad($task['pid'], $widths['taskpid']);
            $out .= str_pad($task['mem'].'MB', $widths['mem']);
            $out .= str_pad($task['collect_succ'], $widths['collect succ']);
            $out .= str_pad($task['collect_fail'], $widths['collect fail']);
            $out .= str_pad($task['speed'].'/s', $widths['speed']);
            $out .= "\n";
        }
    }

    return $out;
}
/**
 * Build the "SERVER" section of the status dashboard (multi-server mode).
 *
 * Reads the JSON-encoded server list stored under the 'server_list' queue
 * key and, for every server, sums memory, speed and collect counters over
 * that server's task status entries. When the list is missing or cannot be
 * decoded, only the title bar and header row are returned.
 *
 * @return string the rendered section, terminated by a newline
 */
public function display_server_ui()
{
    $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n";
    // The first header pads against strlen('serverid') although the label is
    // 'server'; the resulting header width equals self::$server_length, which
    // is the width used for the data rows below.
    $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')).
        "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')).
        "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')).
        "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')).
        "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')).
        "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')).
        "\n";

    // json_decode() yields null for a missing or malformed value; iterating
    // that directly would raise a warning, so normalise to an empty list.
    $server_list_json = queue::get('server_list');
    $server_list = is_string($server_list_json) ? json_decode($server_list_json, true) : null;
    if ( ! is_array($server_list))
    {
        $server_list = array();
    }

    foreach ($server_list as $server)
    {
        $serverid = $server['serverid'];
        $tasknum = $server['tasknum'];
        $mem = 0;
        $speed = 0;
        $collect_succ = $collect_fail = 0;

        // Aggregate the per-task figures of this server.
        $task_status = $this->get_task_status_list($serverid, $tasknum);
        foreach ($task_status as $json)
        {
            $task = json_decode($json, true);
            if (empty($task))
            {
                continue;
            }
            $mem += $task['mem'];
            $speed += $task['speed'];
            $collect_fail += $task['collect_fail'];
            $collect_succ += $task['collect_succ'];
        }

        $display_str .= str_pad($serverid, self::$server_length).
            str_pad($tasknum, self::$tasknum_length + 2).
            str_pad($mem.'MB', self::$mem_length + 2).
            str_pad($collect_succ, self::$urls_length).
            str_pad($collect_fail, self::$urls_length).
            str_pad($speed.'/s', self::$speed_length + 2).
            "\n";
    }

    return $display_str;
}
/**
 * Build the "COLLECT STATUS" section of the status dashboard.
 *
 * Shows one row with five counters: found pages, queue size, collected
 * pages, extracted fields and depth, each left-aligned in a fixed-width
 * column under a highlighted header.
 *
 * @return string the rendered section, terminated by a newline
 */
public function display_collect_ui()
{
    // Fetch the counters first, in the same order as they have always been read.
    $found     = $this->get_collect_url_num();
    $collected = $this->get_collected_url_num();
    $pending   = $this->queue_lsize();
    $fields    = $this->get_fields_num();
    $depth     = $this->get_depth_num();

    // label, column width, value — in display order.
    $columns = array(
        array('find pages', 16, $found),
        array('queue', 14, $pending),
        array('collected', 15, $collected),
        array('fields', 15, $fields),
        array('depth', 12, $depth),
    );

    $header = '';
    $row = '';
    foreach ($columns as $column)
    {
        $label = $column[0];
        $width = $column[1];
        $header .= "\033[47;30m".$label."\033[0m".str_pad('', $width - strlen($label));
        $row .= str_pad($column[2], $width);
    }

    return "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n"
        .$header."\n"
        .$row."\n";
}
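/**
* Determine whether a URL points to an attachment file.
* NOTE: the implementation below is currently commented out.
*
* @return array file information (empty when the URL is not an attachment)
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-23 17:13
*/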
//public function is_attachment_file($url)
//{
//$mime_types = $GLOBALS['config']['mimetype'];
//$mime_types_flip = array_flip($mime_types);
//$pathinfo = pathinfo($url);
//$fileext = isset($pathinfo['extension']) ? $pathinfo['extension'] : '';
//$fileinfo = array();
//// 存在文件后缀并且是配置里面的后缀
//if (!empty($fileext) && isset($mime_types_flip[$fileext]))
//{
//stream_context_set_default(
//array(
//'http' => array(
//'method' => 'HEAD'
//)
//)
//);
//// 代理和Cookie以后实现, 方法和 file_get_contents 一样 使用 stream_context_create 设置
//$headers = get_headers($url, 1);
//if (strpos($headers[0], '302'))
//{
//$url = $headers['Location'];
//$headers = get_headers($url, 1);
//}
////print_r($headers);
//$fileinfo = array(
//'basename' => isset($pathinfo['basename']) ? $pathinfo['basename'] : '',
//'filename' => isset($pathinfo['filename']) ? $pathinfo['filename'] : '',
//'fileext' => isset($pathinfo['extension']) ? $pathinfo['extension'] : '',
//'filesize' => isset($headers['Content-Length']) ? $headers['Content-Length'] : 0,
//'atime' => isset($headers['Date']) ? strtotime($headers['Date']) : time(),
//'mtime' => isset($headers['Last-Modified']) ? strtotime($headers['Last-Modified']) : time(),
//);
//$mime_type = 'html';
//$content_type = isset($headers['Content-Type']) ? $headers['Content-Type'] : '';
//if (!empty($content_type))
//{
//$mime_type = isset($GLOBALS['config']['mimetype'][$content_type]) ? $GLOBALS['config']['mimetype'][$content_type] : $mime_type;
//}
//$mime_types_flip = array_flip($mime_types);
//// 判断一下是不是文件名被加什么后缀了, 比如 http://www.xxxx.com/test.jpg?token=xxxxx
//if (!isset($mime_types_flip[$fileinfo['fileext']]))
//{
//$fileinfo['fileext'] = $mime_type;
//$fileinfo['basename'] = $fileinfo['filename'].'.'.$mime_type;
//}
//}
//return $fileinfo;
//}
/**
 * Tell whether the current process is the master process.
 *
 * @return mixed the current value of self::$taskmaster (presumably a boolean flag — confirm at its declaration)
 */
public function is_taskmaster()
{
return self::$taskmaster;
}
/**
 * Return the task ID of the current process.
 *
 * @return mixed the current value of self::$taskid (presumably an integer — confirm at its declaration)
 */
public function get_task_id()
{
return self::$taskid;
}
/**
 * Count the distinct sub-domains (hosts) seen so far for the root domain of a URL.
 *
 * The counter is kept in the queue under the root domain as key and is only
 * maintained when self::$use_redis is enabled. A host that has not been
 * seen before increments the counter and is remembered under a
 * 'sub_d-<host>' key (hosts longer than 32 characters are stored as their
 * md5 hash). Once the counter exceeds the configured 'max_sub_num', it is
 * returned as-is without registering further hosts.
 *
 * @param string $url URL to inspect
 * @return mixed 0 when the URL is empty, has no resolvable root domain or
 *               host, or redis is not in use; otherwise the counter value
 *               as returned by the queue
 */
public function sub_domain_count($url)
{
    if (empty($url))
    {
        return 0;
    }

    $root = $this->getRootDomain($url, 'root');
    if (empty($root))
    {
        return 0;
    }

    $host = $this->getRootDomain($url, 'host');
    if (empty($host) || ! self::$use_redis)
    {
        return 0;
    }

    $count = queue::get($root);

    // Limit already exceeded: report the counter without registering the host.
    $limit_exceeded = ! empty(self::$configs['max_sub_num']) && $count > self::$configs['max_sub_num'];
    if ($limit_exceeded)
    {
        return $count;
    }

    // Long host names are hashed to keep the marker key short.
    $hostkey = 'sub_d-'.(strlen($host) > 32 ? md5($host) : $host);
    if ( ! queue::exists($hostkey))
    {
        // First time this host is seen: bump the counter and mark the host.
        $count = queue::incr($root);
        queue::set($hostkey, 1);
    }

    return $count;
}
/**
 * Extract the root (registrable) domain, or a related part, from a URL or bare host.
 *
 * The input is trimmed, prefixed with "http://" when it does not already
 * start with "http://" or "https://", cut at the first character that is
 * not a Han character, ASCII letter, digit, '-', '.' or '/', lower-cased
 * and parsed. The host is then split on '.' and matched against a built-in
 * list of known suffixes to decide where the registrable domain starts.
 *
 * @param string $url          URL or host name
 * @param string $type         what to return:
 *                             'root' (default, also any unknown value) - registrable domain, e.g. "example.com";
 *                             'host' - full host name;
 *                             'tld' - matched suffix, e.g. "com" or "co.uk";
 *                             'subdomain' - the 'name' entry, i.e. the label directly left of the matched suffix;
 *                             'arr' - array with keys scheme, host, path, name, domain, tld
 * @param bool   $domain_check when false (default) and $type is 'host', the
 *                             parsed host is returned immediately without
 *                             suffix validation
 * @return string|array|false  the requested part; $url itself when it is
 *                             empty; '' when no host can be parsed or the
 *                             result is not a valid domain; false when the
 *                             host has more than two labels and its last
 *                             label is not a known suffix
 */
public function getRootDomain($url = '', $type = 'root', $domain_check = false)
{
    if (empty($url))
    {
        return $url;
    }
    $url = trim($url);

    // Require the full "http(s)://" prefix. Testing only for a leading "http"
    // would mistake bare hosts such as "httpbin.org" for URLs that already
    // carry a scheme, leaving parse_url() without a host to find.
    if ( ! preg_match('/^https?:\/\//i', $url))
    {
        $url = 'http://'.$url;
    }

    // Keep only the leading run of allowed characters (drops query, port, etc.).
    $matches = array();
    if (preg_match('/(^https?:\/\/[\p{Han}a-zA-Z0-9\-\.\/]+)/iu', $url, $matches))
    {
        $url = $matches[0];
    }

    $url_parse = parse_url(strtolower($url));
    if (empty($url_parse['host']))
    {
        return '';
    }

    // Fast path: the caller only wants the host and no validation.
    if ($domain_check === false and $type == 'host')
    {
        return $url_parse['host'];
    }

    $res = array(
        'scheme' => $url_parse['scheme'],
        'host'   => $url_parse['host'],
        'path'   => empty($url_parse['path']) ? '' : $url_parse['path'],
        'name'   => '',
        'domain' => '',
        'tld'    => '',
    );

    // Known suffix labels.
    $state_domain = array('com', 'edu', 'gov', 'int', 'mil', 'net', 'org', 'biz', 'info', 'pro', 'name', 'coop', 'aero', 'xxx', 'idv', 'mobi', 'cc', 'me', 'jp', 'uk', 'ws', 'eu', 'pw', 'kr', 'io', 'us', 'cn', 'al', 'dz', 'af', 'ar', 'ae', 'aw', 'om', 'az', 'eg', 'et', 'ie', 'ee', 'ad', 'ao', 'ai', 'ag', 'at', 'au', 'mo', 'bb', 'pg', 'bs', 'pk', 'py', 'ps', 'bh', 'pa', 'br', 'by', 'bm', 'bg', 'mp', 'bj', 'be', 'is', 'pr', 'ba', 'pl', 'bo', 'bz', 'bw', 'bt', 'bf', 'bi', 'bv', 'kp', 'gq', 'dk', 'de', 'tl', 'tp', 'tg', 'dm', 'do', 'ru', 'ec', 'er', 'fr', 'fo', 'pf', 'gf', 'tf', 'va', 'ph', 'fj', 'fi', 'cv', 'fk', 'gm', 'cg', 'cd', 'co', 'cr', 'gg', 'gd', 'gl', 'ge', 'cu', 'gp', 'gu', 'gy', 'kz', 'ht', 'nl', 'an', 'hm', 'hn', 'ki', 'dj', 'kg', 'gn', 'gw', 'ca', 'gh', 'ga', 'kh', 'cz', 'zw', 'cm', 'qa', 'ky', 'km', 'ci', 'kw', 'hr', 'ke', 'ck', 'lv', 'ls', 'la', 'lb', 'lt', 'lr', 'ly', 'li', 're', 'lu', 'rw', 'ro', 'mg', 'im', 'mv', 'mt', 'mw', 'my', 'ml', 'mk', 'mh', 'mq', 'yt', 'mu', 'mr', 'um', 'as', 'vi', 'mn', 'ms', 'bd', 'pe', 'fm', 'mm', 'md', 'ma', 'mc', 'mz', 'mx', 'nr', 'np', 'ni', 'ne', 'ng', 'nu', 'no', 'nf', 'na', 'za', 'aq', 'gs', 'pn', 'pt', 'se', 'ch', 'sv', 'yu', 'sl', 'sn', 'cy', 'sc', 'sa', 'cx', 'st', 'sh', 'kn', 'lc', 'sm', 'pm', 'vc', 'lk', 'sk', 'si', 'sj', 'sz', 'sd', 'sr', 'sb', 'so', 'tj', 'tw', 'th', 'tz', 'to', 'tc', 'tt', 'tn', 'tv', 'tr', 'tm', 'tk', 'wf', 'vu', 'gt', 've', 'bn', 'ug', 'ua', 'uy', 'uz', 'es', 'eh', 'gr', 'hk', 'sg', 'nc', 'nz', 'hu', 'sy', 'jm', 'am', 'ac', 'ye', 'iq', 'ir', 'il', 'it', 'in', 'id', 'vg', 'jo', 'vn', 'zm', 'je', 'td', 'gi', 'cl', 'cf', 'yr', 'arpa', 'museum', 'asia', 'ax', 'bl', 'bq', 'cat', 'cw', 'gb', 'jobs', 'mf', 'rs', 'su', 'sx', 'tel', 'travel', 'shop', 'ltd', 'store', 'vip', '网店', '中国', '公司', '网络', 'co.il', 'co.nz', 'co.uk', 'me.uk', 'org.uk', 'com.sb', '在线', '中文网', '移动', 'wang', 'club', 'ren', 'top', 'website', 'cool', 'company', 'city', 'email', 'market', 'software', 'ninja', '我爱你', 'bike', 'today',
        'life', 'space', 'pub', 'site', 'help', 'link', 'photo', 'video', 'click', 'pics', 'sexy', 'audio', 'gift', 'tech', '网址', 'online', 'win', 'download', 'party', 'bid', 'loan', 'date', 'trade', 'red', 'blue', 'pink', 'poker', 'green', 'farm', 'zone', 'guru', 'tips', 'land', 'care', 'camp', 'cab', 'cash', 'limo', 'toys', 'tax', 'town', 'fish', 'fund', 'fail', 'house', 'shoes', 'media', 'guide', 'tools', 'solar', 'watch', 'cheap', 'rocks', 'news', 'live', 'lawyer', 'host', 'wiki', 'ink', 'design', 'lol', 'hiphop', 'hosting', 'diet', 'flowers', 'car', 'cars', 'auto', 'mom', 'cq', 'he', 'nm', 'ln', 'jl', 'hl', 'js', 'zj', 'ah', 'jx', 'ha', 'hb', 'gx', 'hi', 'gz', 'yn', 'xz', 'qh', 'nx', 'xj', 'xyz', 'xin', 'science', 'press', 'band', 'engineer', 'social', 'studio', 'work', 'game', 'kim', 'games', 'group', '集团');
    $generic_tlds = array('com', 'net', 'org', 'edu', 'gov');

    // Work from the right-most label backwards.
    $labels = explode('.', $url_parse['host']);
    $label_count = count($labels);
    $last = array_pop($labels);
    $last_1 = array_pop($labels);

    if ($label_count <= 2)
    {
        // Host is already in root form ("name.tld"); accept it only for a known suffix.
        if (in_array($last, $state_domain, true))
        {
            $res['domain'] = $last_1.'.'.$last;
            $res['name'] = $last_1;
            $res['tld'] = $last;
        }
    }
    else
    {
        $last_2 = array_pop($labels);

        // Hosts whose last label is not a known suffix are rejected outright.
        if ( ! in_array($last, $state_domain, true))
        {
            return false;
        }

        // Default: "name.tld".
        $res['domain'] = $last_1.'.'.$last;
        $res['name'] = $last_1;
        $res['tld'] = $last;

        // Two-level suffix such as "co.uk" or "com.cn": the second label is
        // itself a known suffix and the last label is not a generic TLD.
        if ($last_1 !== $last && in_array($last_1, $state_domain, true) && ! in_array($last, $generic_tlds, true))
        {
            $res['domain'] = $last_2.'.'.$last_1.'.'.$last;
            $res['name'] = $last_2;
            $res['tld'] = $last_1.'.'.$last;
        }

        // Under "cn" only com/net/org/edu/gov and two-letter labels are
        // treated as second-level suffixes; anything longer is the name itself.
        if ($last === 'cn' && $last_1 !== $last && strlen($last_1) > 2 && ! in_array($last_1, $generic_tlds, true))
        {
            $res['domain'] = $last_1.'.'.$last;
            $res['name'] = $last_1;
            $res['tld'] = $last;
        }
    }

    // Final sanity check: the result must look like a domain name.
    if (empty($res['domain']) || ! preg_match('/^([\p{Han}a-zA-Z0-9])+([\p{Han}a-zA-Z0-9\-])*\.[a-zA-Z\.\p{Han}]+$/iu', $res['domain']))
    {
        return '';
    }

    switch ($type)
    {
        case 'arr':
            return $res;
        case 'host':
            return $res['host'];
        case 'tld':
            return $res['tld'];
        case 'subdomain':
            return $res['name'];
        default:
            return $res['domain'];
    }
}
}