// +---------------------------------------------------------------------- // +---------------------------------------------------------------------- // | GET请求 // | requests::get('http://www.test.com'); // | SERVER // | $_GET // +---------------------------------------------------------------------- // | POST请求 // | $data = array('name'=>'request'); // | requests::post('http://www.test.com', $data); // | SERVER // | $_POST // +---------------------------------------------------------------------- // | POST RESTful请求 // | $data = array('name'=>'request'); // | $data_string = json_encode($data); // | requests::set_header("Content-Type", "application/json"); // | requests::post('http://www.test.com', $data_string); // | SERVER // | file_get_contents('php://input') // +---------------------------------------------------------------------- // | POST 文件上传 // | $data = array('file1'=>''./data/phpspider.log''); // | requests::post('http://www.test.com', null, $data); // | SERVER // | $_FILES // +---------------------------------------------------------------------- // | 代理 // | requests::set_proxy(array('223.153.69.150:42354')); // | $html = requests::get('https://www.test.com'); // +---------------------------------------------------------------------- //---------------------------------- // PHPSpider请求类文件 //---------------------------------- namespace phpspider\core; if (!function_exists('curl_file_create')) { function curl_file_create($filename, $mimetype = '', $postname = '') { return "@$filename;filename=" . ($postname ?: basename($filename)) . ($mimetype ? ";type=$mimetype" : ''); } } class requests { const VERSION = '2.0.1'; protected static $ch = null; /**** Public variables ****/ /* user definable vars */ public static $timeout = 15; public static $encoding = null; public static $input_encoding = null; public static $output_encoding = null; public static $cookies = array(); // array of cookies to pass // $cookies['username'] = "seatle"; public static $rawheaders = array(); // array of raw headers to send public static $domain_cookies = array(); // array of cookies for domain to pass public static $hosts = array(); // random host binding for make request faster public static $headers = array(); // headers returned from server sent here public static $useragents = array("requests/2.0.0"); // random agent we masquerade as public static $client_ips = array(); // random ip we masquerade as public static $proxies = array(); // random proxy ip public static $raw = ""; // head + body content returned from server sent here public static $head = ""; // head content public static $content = ""; // The body before encoding public static $text = ""; // The body after encoding public static $info = array(); // curl info public static $history = 302; // http request status before redirect. ex:30x public static $status_code = 0; // http request status public static $error = ""; // error messages sent here /** * set timeout * $timeout 为数组时会分别设置connect和read * * @param init or array $timeout * @return */ public static function set_timeout($timeout) { self::$timeout = $timeout; } /** * 设置代理 * 如果代理有多个,请求时会随机使用 * * @param mixed $proxies * array ( * 'socks5://user1:pass2@host:port', * 'socks5://user2:pass2@host:port' *) * @return void * @author seatle * @created time :2016-09-18 10:17 */ public static function set_proxy($proxy) { self::$proxies = is_array($proxy) ? $proxy : array($proxy); } /** * 删除代理 * 因为每个链接信息里面都有代理信息,有的链接需要,有的不需要,所以必须提供一个删除功能 * * @return void * @author seatle * @created time :2018-07-16 17:59 */ public static function del_proxy() { self::$proxies = array(); } /** * 自定义请求头部 * 请求头内容可以用 requests::$rawheaders 来获取 * 比如获取Content-Type:requests::$rawheaders['Content-Type'] * * @param string $headers * @return void */ public static function set_header($key, $value) { self::$rawheaders[$key] = $value; } /** * 设置全局COOKIE * * @param string $cookie * @return void */ public static function set_cookie($key, $value, $domain = '') { if (empty($key)) { return false; } if (!empty($domain)) { self::$domain_cookies[$domain][$key] = $value; } else { self::$cookies[$key] = $value; } return true; } /** * 批量设置全局cookie * * @param mixed $cookies * @param string $domain * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function set_cookies($cookies, $domain = '') { $cookies_arr = explode(';', $cookies); if (empty($cookies_arr)) { return false; } foreach ($cookies_arr as $cookie) { $cookie_arr = explode('=', $cookie, 2); $key = $cookie_arr[0]; $value = empty($cookie_arr[1]) ? '' : $cookie_arr[1]; if (!empty($domain)) { self::$domain_cookies[$domain][$key] = $value; } else { self::$cookies[$key] = $value; } } return true; } /** * 获取单一Cookie * * @param mixed $name cookie名称 * @param string $domain 不传则取全局cookie,就是手动set_cookie的cookie * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function get_cookie($name, $domain = '') { if (!empty($domain) && !isset(self::$domain_cookies[$domain])) { return ''; } $cookies = empty($domain) ? self::$cookies : self::$domain_cookies[$domain]; return isset($cookies[$name]) ? $cookies[$name] : ''; } /** * 获取Cookie数组 * * @param string $domain 不传则取全局cookie,就是手动set_cookie的cookie * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function get_cookies($domain = '') { if (!empty($domain) && !isset(self::$domain_cookies[$domain])) { return array(); } return empty($domain) ? self::$cookies : self::$domain_cookies[$domain]; } /** * 删除Cookie * * @param string $domain 不传则删除全局Cookie * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function del_cookie($key, $domain = '') { if (empty($key)) { return false; } if (!empty($domain) && !isset(self::$domain_cookies[$domain])) { return false; } if (!empty($domain)) { if (isset(self::$domain_cookies[$domain][$key])) { unset(self::$domain_cookies[$domain][$key]); } } else { if (isset(self::$cookies[$key])) { unset(self::$cookies[$key]); } } return true; } /** * 删除Cookie * * @param string $domain 不传则删除全局Cookie * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function del_cookies($domain = '') { if (!empty($domain) && !isset(self::$domain_cookies[$domain])) { return false; } if ( empty($domain) ) { self::$cookies = array(); } else { if (isset(self::$domain_cookies[$domain])) { unset(self::$domain_cookies[$domain]); } } return true; } /** * 设置随机的user_agent * * @param string $useragent * @return void */ public static function set_useragent($useragent) { self::$useragents = is_array($useragent) ? $useragent : array($useragent); } /** * set referer * */ public static function set_referer($referer) { self::$rawheaders['Referer'] = $referer; } /** * 设置伪造IP * 传入数组则为随机IP * @param string $ip * @return void */ public static function set_client_ip($ip) { self::$client_ips = is_array($ip) ? $ip : array($ip); } /** * 删除伪造IP * * @return void * @author seatle * @created time :2018-07-16 17:59 */ public static function del_client_ip() { self::$client_ips = array(); } /** * 设置中文请求 * * @param string $lang * @return void */ public static function set_accept_language($lang = 'zh-CN') { self::$rawheaders['Accept-Language'] = $lang; } /** * 设置Hosts * 负载均衡到不同的服务器,如果对方使用CDN,采用这个是最好的了 * * @param string $hosts * @return void */ public static function set_hosts($host, $ips = array()) { $ips = is_array($ips) ? $ips : array($ips); self::$hosts[$host] = $ips; } /** * 分割返回的header和body * header用来判断编码和获取Cookie * body用来判断编码,得到编码前和编码后的内容 * * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function split_header_body() { $head = $body = ''; $head = substr(self::$raw, 0, self::$info['header_size']); $body = substr(self::$raw, self::$info['header_size']); // http header self::$head = $head; // The body before encoding self::$content = $body; //$http_headers = array(); //// 解析HTTP数据流 //if (!empty(self::$raw)) //{ //self::get_response_cookies($domain); //// body里面可能有 \r\n\r\n,但是第一个一定是HTTP Header,去掉后剩下的就是body //$array = explode("\r\n\r\n", self::$raw); //foreach ($array as $k=>$v) //{ //// post 方法会有两个http header:HTTP/1.1 100 Continue、HTTP/1.1 200 OK //if (preg_match("#^HTTP/.*? 100 Continue#", $v)) //{ //unset($array[$k]); //continue; //} //if (preg_match("#^HTTP/.*? \d+ #", $v)) //{ //$header = $v; //unset($array[$k]); //$http_headers = self::get_response_headers($v); //} //} //$body = implode("\r\n\r\n", $array); //} // 设置了输出编码的转码,注意: xpath只支持utf-8,iso-8859-1 不要转,他本身就是utf-8 $body = self::encoding($body); //自动转码 // 转码后 self::$encoding = self::$output_encoding; // The body after encoding self::$text = $body; return array($head, $body); } /** * 获得域名相对应的Cookie * * @param mixed $header * @param mixed $domain * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function get_response_cookies($header, $domain) { // 解析Cookie并存入 self::$cookies 方便调用 preg_match_all("/.*?Set\-Cookie: ([^\r\n]*)/i", $header, $matches); $cookies = empty($matches[1]) ? array() : $matches[1]; // 解析到Cookie if (!empty($cookies)) { $cookies = implode(';', $cookies); $cookies = explode(';', $cookies); foreach ($cookies as $cookie) { $cookie_arr = explode('=', $cookie, 2); // 过滤 httponly、secure if (count($cookie_arr) < 2) { continue; } $cookie_name = !empty($cookie_arr[0]) ? trim($cookie_arr[0]) : ''; if (empty($cookie_name)) { continue; } // 过滤掉domain路径 if (in_array(strtolower($cookie_name), array('path', 'domain', 'expires', 'max-age'))) { continue; } self::$domain_cookies[$domain][trim($cookie_arr[0])] = trim($cookie_arr[1]); } } } /** * 获得response header * 此方法占时没有用到 * * @param mixed $header * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function get_response_headers($header) { $headers = array(); $header_lines = explode("\n", $header); if (!empty($header_lines)) { foreach ($header_lines as $line) { $header_arr = explode(':', $line, 2); $key = empty($header_arr[0]) ? '' : trim($header_arr[0]); $val = empty($header_arr[1]) ? '' : trim($header_arr[1]); if (empty($key) || empty($val)) { continue; } $headers[$key] = $val; } } self::$headers = $headers; return self::$headers; } /** * 获取编码 * @param $string * @return string */ public static function get_encoding($string) { $encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); return strtolower($encoding); } /** * 移除页面head区域代码 * @param $html * @return mixed */ private static function _remove_head($html) { return preg_replace('/.+<\/head>/is', '', $html); } /** * 简单的判断一下参数是否为一个URL链接 * @param string $str * @return boolean */ private static function _is_url($url) { //$pattern = '/^http(s)?:\\/\\/.+/'; $pattern = "/\b(([\w-]+:\/\/?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|\/)))/"; if (preg_match($pattern, $url)) { return true; } return false; } /** * 初始化 CURL * */ public static function init() { if (!is_resource ( self::$ch )) { self::$ch = curl_init (); curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true ); curl_setopt( self::$ch, CURLOPT_HEADER, false ); curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION ); // 如果设置了两个时间,就分开设置 if (is_array(self::$timeout)) { curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] ); curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); } else { curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, ceil(self::$timeout / 2)); curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout); } curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); } return self::$ch; } /** * get 请求 */ public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert); } /** * post 请求 * $fields 有三种类型:1、数组;2、http query;3、json * 1、array('name'=>'yangzetao') * 2、http_build_query(array('name'=>'yangzetao')) * 3、json_encode(array('name'=>'yangzetao')) * 前两种是普通的post,可以用$_POST方式获取 * 第三种是post stream( json rpc,其实就是webservice ) * 虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取 * * @param mixed $url * @param array $fields * @param mixed $proxies * @static * @access public * @return void */ public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'POST', $fields, $files, $allow_redirects, $cert); } public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'PUT', $fields, $allow_redirects, $cert); } public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'DELETE', $fields, $allow_redirects, $cert); } // 响应HTTP头域里的元信息 // 此方法被用来获取请求实体的元信息而不需要传输实体主体(entity-body) // 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。. public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); self::request($url, 'HEAD', $fields, $allow_redirects, $cert); } public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert); } public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL) { self::init (); return self::request($url, 'PATCH', $fields, $allow_redirects, $cert); } /** * request * * @param mixed $url 请求URL * @param string $method 请求方法 * @param array $fields 表单字段 * @param array $files 上传文件 * @param mixed $cert CA证书 * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) { $method = strtoupper($method); if(!self::_is_url($url)) { self::$error = "You have requested URL ({$url}) is not a valid HTTP address"; return false; } // 如果是 get 方式,直接拼凑一个 url 出来 if ($method == 'GET' && !empty($fields)) { $url = $url.(strpos($url, '?') === false ? '?' : '&').http_build_query($fields); } $parse_url = parse_url($url); if (empty($parse_url) || empty($parse_url['host']) || !in_array($parse_url['scheme'], array('http', 'https'))) { self::$error = "No connection adapters were found for '{$url}'"; return false; } $scheme = $parse_url['scheme']; $domain = $parse_url['host']; // 随机绑定 hosts,做负载均衡 if (self::$hosts) { if (isset(self::$hosts[$domain])) { $hosts = self::$hosts[$domain]; $key = rand(0, count($hosts)-1); $ip = $hosts[$key]; $url = str_replace($domain, $ip, $url); self::$rawheaders['Host'] = $domain; } } curl_setopt( self::$ch, CURLOPT_URL, $url ); if ($method != 'GET') { // 如果是 post 方式 if ($method == 'POST') { //curl_setopt( self::$ch, CURLOPT_POST, true ); $tmpheaders = array_change_key_case(self::$rawheaders, CASE_LOWER); // 有些RESTful服务只接受JSON形态的数据 // CURLOPT_POST会把上傳的文件类型设为 multipart/form-data // 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码 // CURLOPT_CUSTOMREQUEST可以按指定内容上传 if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) { curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); } else { curl_setopt( self::$ch, CURLOPT_POST, true ); } $file_fields = array(); if (!empty($files)) { foreach ($files as $postname => $file) { $filepath = realpath($file); // 如果文件不存在 if (!file_exists($filepath)) { continue; } $filename = basename($filepath); $type = self::get_mimetype($filepath); $file_fields[$postname] = curl_file_create($filepath, $type, $filename); // curl -F "name=seatle&file=@/absolute/path/to/image.png" htt://localhost/uploadfile.php //$cfile = '@'.realpath($filename).";type=".$type.";filename=".$filename; } } } else { self::$rawheaders['X-HTTP-Method-Override'] = $method; curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); } if ( $method == 'POST' ) { // 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包 if ( empty($file_fields) ) { // post方式 if ( is_array($fields) ) { $fields = http_build_query($fields); } } else { // 有post数据 if ( is_array($fields) && !empty($fields) ) { // 某些server可能会有问题 $fields = array_merge($fields, $file_fields); } else { $fields = $file_fields; } } // 不能直接传数组,不知道是什么Bug,会非常慢 curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); } } $cookies = self::get_cookies(); $domain_cookies = self::get_cookies($domain); $cookies = array_merge($cookies, $domain_cookies); // 是否设置了cookie if (!empty($cookies)) { foreach ($cookies as $key=>$value) { $cookie_arr[] = $key.'='.$value; } $cookies = implode('; ', $cookie_arr); curl_setopt(self::$ch, CURLOPT_COOKIE, $cookies); } if (!empty(self::$useragents)) { $key = rand(0, count(self::$useragents) - 1); self::$rawheaders['User-Agent'] = self::$useragents[$key]; } if (!empty(self::$client_ips)) { $key = rand(0, count(self::$client_ips) - 1); self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key]; self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key]; } if (self::$rawheaders) { $http_headers = array(); foreach (self::$rawheaders as $k=>$v) { $http_headers[] = $k.': '.$v; } curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers ); } curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' ); // 关闭验证 if ($scheme == 'https') { curl_setopt(self::$ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt(self::$ch, CURLOPT_SSL_VERIFYHOST, false); } if (self::$proxies) { $key = rand(0, count(self::$proxies) - 1); $proxy = self::$proxies[$key]; curl_setopt( self::$ch, CURLOPT_PROXY, $proxy ); } // header + body,header 里面有 cookie curl_setopt( self::$ch, CURLOPT_HEADER, true ); // 请求跳转后的内容 if ($allow_redirects) { curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true); } self::$raw = curl_exec ( self::$ch ); // 真实url //$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL); self::$info = curl_getinfo( self::$ch ); //print_r(self::$info); self::$status_code = self::$info['http_code']; if (self::$raw === false) { self::$error = 'Curl error: ' . curl_error( self::$ch ); //trigger_error(self::$error, E_USER_WARNING); } // 关闭句柄 curl_close( self::$ch ); // 请求成功之后才把URL存起来 list($header, $text) = self::split_header_body(); self::$history = self::get_history($header); self::$headers = self::get_response_headers($header); self::get_response_cookies($header, $domain); //$data = substr($data, 10); //$data = gzinflate($data); return $text; } public static function get_history($header) { $status_code = 0; $lines = explode("\n", $header); foreach ($lines as $line) { $line = trim($line); if (preg_match("#^HTTP/.*? (\d+) Found#", $line, $out)) { $status_code = empty($out[1]) ? 0 : intval($out[1]); } } return $status_code; } // 获取 mimetype public static function get_mimetype($filepath) { $fp = finfo_open(FILEINFO_MIME); $mime = finfo_file($fp, $filepath); finfo_close($fp); $arr = explode(';', $mime); $type = empty($arr[0]) ? '' : $arr[0]; return $type; } /** * 拼凑文件和表单 * 占时没有用到 * * @param mixed $post_fields * @param mixed $file_fields * @return void * @author seatle * @created time :2017-08-03 18:06 */ public static function get_postfile_form($post_fields, $file_fields) { // 构造post数据 $data = ''; $delimiter = '-------------' . uniqid(); // 表单数据 foreach ($post_fields as $name => $content) { $data .= '--'.$delimiter."\r\n"; $data .= 'Content-Disposition: form-data; name = "'.$name.'"'; $data .= "\r\n\r\n"; $data .= $content; $data .= "\r\n"; } foreach ($file_fields as $input_name => $file) { $data .= '--'.$delimiter."\r\n"; $data .= 'Content-Disposition: form-data; name = "'.$input_name.'";'. ' filename="'.$file['filename'].'"'."\r\n"; $data .= "Content-Type: {$file['type']}\r\n"; $data .= "\r\n"; $data .= $file['content']; $data .= "\r\n"; } // 结束符 $data .= '--'.$delimiter."--\r\n"; //return array( //CURLOPT_HTTPHEADER => array( //'Content-Type:multipart/form-data;boundary=' . $delimiter, //'Content-Length:' . strlen($data) //), //CURLOPT_POST => true, //CURLOPT_POSTFIELDS => $data, //); return array($delimiter, $data); } /** * html encoding transform * * @param string $html * @param string $in * @param string $out * @param string $content * @param string $mode * auto|iconv|mb_convert_encoding * @return string */ public static function encoding($html, $in = null, $out = null, $mode = 'auto') { $valid = array( 'auto', 'iconv', 'mb_convert_encoding', ); if (isset(self::$output_encoding)) { $out = self::$output_encoding; } if ( ! isset($out)) { $out = 'UTF-8'; } if ( ! in_array($mode, $valid)) { throw new Exception('invalid mode, mode='.$mode); } $if = function_exists('mb_convert_encoding'); $if = $if && ($mode == 'auto' || $mode == 'mb_convert_encoding'); if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv')) { $func = 'iconv'; } elseif ($if) { $func = 'mb_convert_encoding'; } else { throw new Exception('charsetTrans failed, no function'); } $pattern = '/(]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is'; if ( ! isset($in)) { $n = preg_match($pattern, $html, $in); if ($n > 0) { $in = $in[3]; } else { $in = null; } if (empty($in) and function_exists('mb_detect_encoding')) { $in = mb_detect_encoding($html, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); } } if (isset($in)) { if ($in == 'ISO-8859-1') { $in = 'UTF-8'; } $old = error_reporting(error_reporting() & ~E_NOTICE); $html = call_user_func($func, $in, $out.'//IGNORE', $html); error_reporting($old); $html = preg_replace($pattern, "\\1$out\\4", $html, 1); } return $html; } }