curl模拟ip和来源进行网站采集的实现方法,附模拟百度抓取
来源:www.cnblogs.com 更新时间:2023-05-25 21:55
对于限制了ip和来源的网站,使用常规的采集方式是行不通的。这里介绍一种方法:使用php的curl扩展模拟ip和来源,即可采集这类限制了ip和来源的网站。需要注意的是,该方法只对通过CLIENT-IP、X-FORWARDED-FOR等请求头获取访问者ip的服务端有效;如果服务端只根据REMOTE_ADDR(真实连接ip)进行判断,则无法通过请求头伪造。
1.设置页面限制ip和来源访问
比如服务端的server.php
<?php
// Demo endpoint: prints 'allow access' only when the reported client IP equals
// the whitelisted address AND the Referer begins with the whitelisted origin;
// otherwise prints 'deny access'.
$allow_ip = '192.168.0.100';
$allow_referer = 'http://www.xxx.cn';

$client_ip = getip();
$referer = getreferer();

echo ($client_ip == $allow_ip && strpos($referer, $allow_referer) === 0)
    ? 'allow access'
    : 'deny access';

/**
 * Return the visitor's IP address as reported by the request.
 *
 * Looks at $_SERVER['HTTP_CLIENT_IP'], then $_SERVER['HTTP_X_FORWARDED_FOR'],
 * then $_SERVER['REMOTE_ADDR'], and returns the first non-empty value.
 * The first two are populated from request headers, so the client controls them.
 *
 * @return string The first non-empty candidate, or '' when none is available.
 */
function getip(){
    $candidates = array('HTTP_CLIENT_IP', 'HTTP_X_FORWARDED_FOR', 'REMOTE_ADDR');
    foreach ($candidates as $key) {
        if (!empty($_SERVER[$key])) {
            return $_SERVER[$key];
        }
    }
    return '';
}

/**
 * Return the Referer of the current request.
 *
 * @return string $_SERVER['HTTP_REFERER'] when set, otherwise ''.
 */
function getreferer(){
    return isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : '';
}
?>
使用curl正常访问
<?php
/**
 * Send a POST request with cURL and return the response.
 *
 * If curl_error() reports a message after the transfer, the script is
 * terminated via die() with that message.
 *
 * @param string $url     Target URL.
 * @param array  $data    Form fields; encoded with http_build_query().
 * @param array  $header  Raw request header lines.
 * @param int    $timeout Value passed to CURLOPT_TIMEOUT (seconds).
 * @return mixed The value returned by curl_exec() with CURLOPT_RETURNTRANSFER enabled.
 */
function doCurl($url, $data=array(), $header=array(), $timeout=30){
    // Option table, applied in the same order as listed.
    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HTTPHEADER     => $header,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => http_build_query($data),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => $timeout,
    );

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);

    $failure = curl_error($handle);
    if ($failure) {
        die($failure);
    }

    curl_close($handle);
    return $response;
}

// Usage
$url = 'http://www.xxx.cn/server.php';
$response = doCurl($url);
echo $response;
?>
使用curl模拟ip和来源进行访问
模拟来源
curl_setopt($ch, CURLOPT_REFERER, '来源');
模拟ip
curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP: 模拟ip','X-FORWARDED-FOR: 模拟ip'));
完整代码如下:
<?php
/**
 * Send a POST request with cURL, with caller-chosen headers and Referer,
 * and return the response.
 *
 * If curl_error() reports a message after the transfer, the script is
 * terminated via die() with that message.
 *
 * @param string $url     Target URL.
 * @param array  $data    Form fields; encoded with http_build_query().
 * @param array  $header  Raw request header lines (used below to send the spoofed IP headers).
 * @param string $referer Value sent as the Referer header.
 * @param int    $timeout Value passed to CURLOPT_TIMEOUT (seconds).
 * @return mixed The value returned by curl_exec() with CURLOPT_RETURNTRANSFER enabled.
 */
function doCurl($url, $data=array(), $header=array(), $referer='', $timeout=30){
    // Option table, applied in the same order as listed.
    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HTTPHEADER     => $header,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => http_build_query($data),
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => $timeout,
        // Spoofed referer
        CURLOPT_REFERER        => $referer,
    );

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);

    $failure = curl_error($handle);
    if ($failure) {
        die($failure);
    }

    curl_close($handle);
    return $response;
}

// Usage
$url = 'http://www.example.cn/server.php'; // external site to scrape

$data = array();

// Headers carrying the spoofed IP
$header = array(
    'CLIENT-IP: 192.168.0.100',
    'X-FORWARDED-FOR: 192.168.0.100'
);

// Spoofed referer
$referer = 'http://www.xxx.cn/';

$response = doCurl($url, $data, $header, $referer, 5);

echo $response;
?>
附录:模拟百度抓取
/**
 * Fetch a URL while presenting the request as coming from Baiduspider.
 *
 * Sends the Baiduspider User-Agent together with X-FORWARDED-FOR / CLIENT-IP
 * headers carrying a random 220.181.108.x address (labelled a Baidu spider
 * address by the original author).
 *
 * On a transfer failure the error message is echoed and "" is returned, so
 * every failure path yields the same empty-string result.
 *
 * @param string $filepath URL to fetch; surrounding whitespace is trimmed.
 * @return string Response body on success, "" if the URL is rejected or the transfer fails.
 */
function bd_crul_get($filepath) {
    $filepath = trim($filepath);

    // Reject anything without a scheme separator before consulting the URL checker.
    // NOTE(review): eToCheckIsUrl2() is defined elsewhere; presumably it validates the URL - confirm.
    if (strpos($filepath, '://') === false || !eToCheckIsUrl2($filepath)) {
        return "";
    }

    $timeout = 15;
    $ip = '220.181.108.' . rand(1, 255);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $filepath);
    // NOTE(review): 0 disables the overall transfer timeout; only the connect
    // phase is bounded by $timeout below. Kept as in the original - confirm intent.
    curl_setopt($ch, CURLOPT_TIMEOUT, 0);
    // Spoofed spider IP headers
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'X-FORWARDED-FOR:' . $ip,
        'CLIENT-IP:' . $ip,
    ));
    // Spoofed spider User-Agent
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Certificate verification is switched off, as in the original.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

    $content = curl_exec($ch);

    // Read the error details before the handle is released.
    $error = '';
    if ($content === false) {
        // 28 is libcurl's "operation timed out" error code.
        $error = (curl_errno($ch) === 28) ? '访问目标地址超时' : curl_error($ch);
    }
    curl_close($ch);

    if ($content === false) {
        echo $error;
        return "";
    }

    return $content;
}
上一篇:php flock的作用详细分析 下一篇:PHP实现用户异地登录提醒