优化彩虹岛副标题抓取

This commit is contained in:
iyuu.cn
2020-01-07 13:09:31 +08:00
parent f8693950ed
commit 7564c30d1c

View File

@@ -91,10 +91,12 @@ class Chdbits implements decodeBase
self::init(); self::init();
Rpc::init(self::SITE, self::METHOD); Rpc::init(self::SITE, self::METHOD);
$html = self::get($url); $html = self::get($url);
#p($html);exit;
if ( $html === null ) { if ( $html === null ) {
exit(1); exit(1);
} }
$data = self::decode($html); $data = self::decode($html);
#p($data);exit;
Rpc::call($data); Rpc::call($data);
exit(0); exit(0);
} }
@@ -150,7 +152,7 @@ class Chdbits implements decodeBase
// 获取副标题(倒序算法) // 获取副标题(倒序算法)
// 偏移量 // 偏移量
$h2StrStart = '<br />'; $h2StrStart = '<br />';
$h2StrEnd = '</td><td width="50" class="embedded"'; $h2StrEnd = '</td><td width="';
$h2_endOffset = strpos($v,$h2StrEnd); $h2_endOffset = strpos($v,$h2StrEnd);
$temp = substr($v, 0, $h2_endOffset); $temp = substr($v, 0, $h2_endOffset);
$h2_offset = strrpos($temp,$h2StrStart); $h2_offset = strrpos($temp,$h2StrStart);
@@ -160,15 +162,20 @@ class Chdbits implements decodeBase
$h2_len = strlen($temp) - $h2_offset - strlen($h2StrStart); $h2_len = strlen($temp) - $h2_offset - strlen($h2StrStart);
//存在副标题 //存在副标题
$titleTemp = substr($temp, $h2_offset + strlen($h2StrStart), $h2_len); $titleTemp = substr($temp, $h2_offset + strlen($h2StrStart), $h2_len);
// 第二次过滤
// 精确适配标签 begin // 精确适配标签 begin
$titleSpan = ''; if ( strpos($titleTemp,'</font>') != false ) {
$titleTemp = selector::select($titleTemp, '//font');
}
$title = selector::remove($titleTemp, "//div"); $title = selector::remove($titleTemp, "//div");
$title = substr($title, 4);
$title = str_replace(" ",'',$title);
$span = array();
$titleSpan = '';
$span = selector::select($titleTemp, '//div'); $span = selector::select($titleTemp, '//div');
if(!empty($span)){ if(!empty($span)){
if(is_array($span)){ if(is_array($span)){
foreach ( $span as $vv ){ foreach ( $span as $vv ){
if( empty($vv) || (strpos($titleTemp,'<div') != false) ){ if( empty($vv) || (strpos($vv,'</div>') != false) ){
continue; continue;
} }
$titleSpan.='['.$vv.'] '; $titleSpan.='['.$vv.'] ';
@@ -178,13 +185,6 @@ class Chdbits implements decodeBase
} }
} }
// 精确适配标签 end // 精确适配标签 end
// 最后过滤
if ( strpos($title,'<font') != false ) {
$offset = 0;
$offset = strpos($title,'>')+1;
$title = substr($title, $offset);
}
$title = str_replace('</font>',"",$title);
$arr['title'] = $titleSpan . $title; $arr['title'] = $titleSpan . $title;
} }