IYUU自动辅种工具初始化版本

2026-03-03 18:05:26 +00:00 · 2019-12-22 00:00:00 +08:00
commit e81d66869a
133 changed files with 33246 additions and 0 deletions
--- a/vendor/owner888/phpspider/library/cls_query.php
+++ b/vendor/owner888/phpspider/library/cls_query.php
@@ -0,0 +1,248 @@
+<?php
+class cls_query
+{
+    private static $content;
+    public static $debug = false;
+
+    public static function init($content)
+    {
+        self::$content = $content;
+    }
+
+    public static function query($query, $attr = "html")
+    {
+        $nodes = self::get_nodes($query);
+        $datas = self::get_datas($nodes, $attr);
+        return $datas;
+    }
+
+    protected static function is_char($char) {
+		return extension_loaded('mbstring') ? mb_eregi('\w', $char) : preg_match('@\w@', $char);
+	}
+
+    /**
+     * 从xpath中得到节点
+     * 
+     * @param mixed $xpath
+     * @return void
+     * @author seatle <seatle@foxmail.com> 
+     * @created time :2015-08-08 15:52
+     */
+    private static function get_nodes($query)
+    {
+        // 把一到多个空格 替换成 一个空格
+        // 把 > 和  ~ 符号两边的空格去掉，因为没有用这两个符号，所以这里可以不这么做
+        // ul>li.className
+        $query = trim(
+			preg_replace('@\s+@', ' ',
+				preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)
+			)
+		);
+
+        $nodes = array();
+		if (! $query)
+        {
+			return $nodes;
+        }
+
+        $query_arr = explode(" ", $query);
+        foreach ($query_arr as $k=>$v) 
+        {
+            $path = $k == 0 ? $v : $path.' '.$v;
+            $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array());
+            // 如果存在内容选择器
+            if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3])) 
+            {
+                // 把选择器过滤掉 [rel='topic']
+                $v = $matches[1];
+                $node['other'] = array(
+                    'key'=>$matches[2],
+                    'val'=>$matches[3],
+                );
+            }
+
+            // 如果存在 id
+            $id_arr = explode("#", $v);
+            $class_arr = explode(".", $v);
+            if (count($id_arr) === 2) 
+            {
+                $node['name'] = $id_arr[0];
+                $node['id'] = $id_arr[1];
+            }
+            // 如果存在 class
+            elseif (count($class_arr) === 2) 
+            {
+                $node['name'] = $class_arr[0];
+                $node['class'] = $class_arr[1];
+            }
+            // 如果没有样式
+            else 
+            {
+                $node['name'] = $v;
+            }
+            $nodes[] = $node;
+        }
+        //print_r($nodes);
+        //exit;
+        return $nodes;
+    }
+
+    public static function get_datas($nodes, $attr = "html")
+    {
+        if (empty(self::$content)) 
+        {
+            return false;
+        }
+
+        $node_datas = array();
+        $count = count($nodes);
+        // 循环所有节点
+        foreach ($nodes as $i=>$node) 
+        {
+            $is_last = $count == $i+1 ? true : false;
+            // 第一次
+            if ($i == 0) 
+            {
+                $datas = array();
+                $datas = self::get_node_datas($node, self::$content, $attr, $is_last);
+                // 如果第一次都取不到数据，直接跳出循环
+                if(!$datas)
+                {
+                    break;
+                }
+                $node_datas[$nodes[$i]['path']] = $datas;
+            }
+            else 
+            {
+                $datas = array();
+                // 循环上一个节点的数组
+                foreach ($node_datas[$nodes[$i-1]['path']] as $v) 
+                {
+                    $datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) );
+                }
+                $node_datas[$nodes[$i]['path']] = $datas;
+                // 删除上一个节点，防止内存溢出，或者缓存到本地，再次使用？！
+                unset($node_datas[$nodes[$i-1]['path']]);
+            }
+        }   
+        //print_r($datas);exit;
+        // 从数组中弹出最后一个元素
+        $node_datas = array_pop($node_datas);
+        //print_r($node_datas);
+        //exit;
+        return $node_datas;
+    }
+
+    /**
+     * 从节点中获取内容
+     * $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
+     * 
+     * @param mixed $node
+     * @param mixed $content
+     * @return void
+     * @author seatle <seatle@foxmail.com> 
+     * @created time :2015-08-08 15:52
+     */
+    private static function get_node_datas($node, $content, $attr = "html", $is_last = false)
+    {
+        $node_datas = $datas = array();
+
+        if (!empty($node['id'])) 
+        {
+            if ($node['name']) 
+                $regex = '@<'.$node['name'].'[^>]+id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)</'.$node['name'].'>@is';
+            else 
+                $regex = '@id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)<@is';
+        }
+        elseif (!empty($node['class'])) 
+        {
+            if ($node['name']) 
+                $regex = '@<'.$node['name'].'[^>]+class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)</'.$node['name'].'>@is';
+            else 
+                $regex = '@class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)<@is';
+        }
+        else 
+        {
+            // 这里为是么是*，0次到多次，因为有可能是 <li>
+            $regex = '@<'.$node['name'].'[^>]*?>(.*?)</'.$node['name'].'>@is';
+        }
+        self::log("regex --- " . $regex);;
+        preg_match_all($regex, $content, $matches);
+        $all_datas = empty($matches[0]) ? array() : $matches[0];
+        $html_datas = empty($matches[1]) ? array() : $matches[1];
+
+        // 过滤掉选择器对不上的
+        foreach ($all_datas as $i=>$data) 
+        {
+            // 如果有设置其他选择器，验证一下选择器
+            if (!empty($node['other'])) 
+            {
+                $regex = '@'.$node['other']['key'].'=[\'|"]'.$node['other']['val'].'[\'|"]@is';
+                self::log("regex other --- " . $regex);
+                // 过滤器对不上的，跳过
+                if (!preg_match($regex, $data, $matches)) 
+                {
+                    continue;
+                }
+            }
+            // 获取节点的html内容
+            if ($attr != "html" && $is_last) 
+            {
+                $regex = '@'.$attr.'=[\'|"](.*?)[\'|"]@is';
+                preg_match($regex, $data, $matches); 
+                $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]);
+            }
+            // 获取节点属性名的值
+            else 
+            {
+                $node_datas[] = trim($html_datas[$i]);
+            }
+        }
+        //echo " 11111 ========================================= \n";
+        //print_r($node_datas);
+        //echo " 22222 ========================================= \n\n\n";
+        return $node_datas;
+    }
+
+    /**
+     * 记录日志
+     * @param string $msg
+     * @return void
+     */
+    private static function log($msg)
+    {
+        $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n";
+        if (self::$debug) 
+        {
+            echo $msg;
+        }
+    }
+
+}
+
+//$xpath = "ul.top-nav-dropdown li";
+//$xpath = "i.zg-icon";
+//print_r($nodes);
+//exit;
+// [^>]+ 不是>的字符重复一次到多次, ? 表示不贪婪
+// \s 表示空白字符
+// * 表示0次或者多次
+// + 表示1次或者多次
+//
+// 后向引用，表示表达式中，从左往右数，第一个左括号对应的括号内的内容。
+// \\0 表示整个表达式
+// \\1表示第1个表达式
+// \\2表示第2个表达式
+// $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
+//preg_match_all($regex, $content, $matches);
+//print_r($matches);
+//exit;
+
+// 用法
+//$content = file_get_contents("./test.html");
+//$query = "ul#top-nav-profile-dropdown li a";
+//$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']";
+//cls_query::init($content);
+//$list = cls_query::query($query, "href");
+//print_r($list);
+