curl模拟ip和来源进行网站采集的实现方法,附模拟百度抓取

来源:www.cnblogs.com 更新时间:2023-05-25 21:55

对于限制了ip和来源的网站,使用正常的采集方式是不行的。这里介绍一种方法:使用PHP的curl扩展在请求中伪造ip和来源(Referer),从而采集这类网站。注意:这种方法只对通过 Client-IP / X-Forwarded-For 请求头判断访问者ip的服务端有效;如果服务端只使用 REMOTE_ADDR,则无法通过请求头伪造ip。

1.设置页面限制ip和来源访问
比如服务端的server.php


<?php
 
// What the visitor reports about itself.
$client_ip = getip();
$referer = getreferer();

// The single IP address and the referer prefix that are let through.
$allow_ip = '192.168.0.100';
$allow_referer = 'http://www.xxx.cn';

// Denied unless the IP matches AND the referer starts with the allowed origin
// (strpos() === 0 means the match is at the very beginning of the string).
$verdict = 'deny access';
if($client_ip==$allow_ip){
  if(strpos($referer, $allow_referer)===0){
    $verdict = 'allow access';
  }
}
echo $verdict;
 
/**
 * Determine the visitor's IP address from the request environment.
 *
 * The first non-empty entry of $_SERVER wins, checked in this order:
 * HTTP_CLIENT_IP, HTTP_X_FORWARDED_FOR, REMOTE_ADDR. The first two come from
 * request headers, so they hold whatever the client chose to send.
 *
 * @return string The selected value, or '' when none of the three is set.
 */
function getip(){
  $candidates = array('HTTP_CLIENT_IP', 'HTTP_X_FORWARDED_FOR', 'REMOTE_ADDR');
  foreach($candidates as $key){
    if(!empty($_SERVER[$key])){
      return $_SERVER[$key];
    }
  }
  return '';
}
 
/**
 * Return the referer the visitor sent with the request.
 *
 * @return string Value of $_SERVER['HTTP_REFERER'], or '' when it is not set.
 */
function getreferer(){
  return isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : '';
}
 
?>

使用curl正常访问(不伪造ip和来源,此时服务端会输出 deny access)

复制代码
<?php
/**
 * Send a POST request with cURL and return the response body.
 *
 * If cURL reports an error after the request, the script is terminated with
 * that error message as its output.
 *
 * @param string $url     Address to request.
 * @param array  $data    Form fields; encoded with http_build_query() and sent as the POST body.
 * @param array  $header  Raw request header lines, passed to CURLOPT_HTTPHEADER.
 * @param int    $timeout Value for CURLOPT_TIMEOUT, in seconds.
 * @return string|false   Result of curl_exec().
 */
function doCurl($url, $data=array(), $header=array(), $timeout=30){

  $ch = curl_init();

  // Option table; each entry is applied with its own curl_setopt() call, in the listed order.
  $options = array(
    CURLOPT_URL            => $url,
    CURLOPT_HTTPHEADER     => $header,
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => http_build_query($data),
    CURLOPT_RETURNTRANSFER => true, // return the body from curl_exec() instead of printing it
    CURLOPT_TIMEOUT        => $timeout,
  );
  foreach($options as $option => $value){
    curl_setopt($ch, $option, $value);
  }

  $response = curl_exec($ch);

  // curl_error() yields an empty string when no error occurred.
  $error = curl_error($ch);
  if($error){
    die($error);
  }

  curl_close($ch);

  return $response;

}
 
// Invocation: plain request, no extra headers and no POST fields.
$url = 'http://www.xxx.cn/server.php';

echo doCurl($url);
?>
复制代码

使用curl模拟ip和来源进行访问

模拟来源

curl_setopt($ch, CURLOPT_REFERER, '来源');

模拟ip

curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP: 模拟ip','X-FORWARDED-FOR: 模拟ip'));

完整代码如下:

复制代码
<?php
/**
 * Send a POST request with cURL, with a caller-chosen referer, and return the response body.
 *
 * If cURL reports an error after the request, the script is terminated with
 * that error message as its output.
 *
 * @param string $url     Address to request.
 * @param array  $data    Form fields; encoded with http_build_query() and sent as the POST body.
 * @param array  $header  Raw request header lines, passed to CURLOPT_HTTPHEADER.
 * @param string $referer Value for CURLOPT_REFERER (the spoofed referer).
 * @param int    $timeout Value for CURLOPT_TIMEOUT, in seconds.
 * @return string|false   Result of curl_exec().
 */
function doCurl($url, $data=array(), $header=array(), $referer='', $timeout=30){

  $ch = curl_init();

  // Option table; each entry is applied with its own curl_setopt() call, in the listed order.
  $options = array(
    CURLOPT_URL            => $url,
    CURLOPT_HTTPHEADER     => $header,
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => http_build_query($data),
    CURLOPT_RETURNTRANSFER => true, // return the body from curl_exec() instead of printing it
    CURLOPT_TIMEOUT        => $timeout,
    CURLOPT_REFERER        => $referer, // spoofed referer
  );
  foreach($options as $option => $value){
    curl_setopt($ch, $option, $value);
  }

  $response = curl_exec($ch);

  // curl_error() yields an empty string when no error occurred.
  $error = curl_error($ch);
  if($error){
    die($error);
  }

  curl_close($ch);

  return $response;

}
 
// Invocation
$url = 'http://www.example.cn/server.php'; // external site to scrape

// Spoofed IP: sent as the two request headers that PHP exposes on the
// receiving side as HTTP_CLIENT_IP and HTTP_X_FORWARDED_FOR.
$header = array(
  'CLIENT-IP: 192.168.0.100',
  'X-FORWARDED-FOR: 192.168.0.100'
);

// Spoofed referer
$referer = 'http://www.xxx.cn/';

// No POST fields; 5-second timeout.
echo doCurl($url, array(), $header, $referer, 5);
?>


附录:模拟百度抓取
 


/**
 * Fetch a URL with cURL while presenting the request as Baiduspider.
 *
 * A random address in 220.181.108.x is sent in the X-FORWARDED-FOR and
 * CLIENT-IP headers, and the Baiduspider user-agent string is used.
 *
 * @param string $filepath URL to fetch; surrounding whitespace is trimmed first.
 * @return string|null The response body on success; "" when the argument has
 *                     no "://" or is rejected by eToCheckIsUrl2(); null after a
 *                     cURL failure, in which case the error message is echoed.
 */
function bd_crul_get($filepath) {
    $filepath = trim($filepath);

    // Cheap check first: without a scheme separator the validator is never consulted.
    if (strpos($filepath, '://') === false) {
        return "";
    }
    if (!eToCheckIsUrl2($filepath)) {
        return "";
    }

    $ch = curl_init();
    $ip = '220.181.108.' . rand(1, 255); // Baidu spider address range
    $timeout = 15;

    // Each entry is applied with its own curl_setopt() call, in the listed order.
    $options = array(
        CURLOPT_URL            => $filepath,
        CURLOPT_TIMEOUT        => 0, // 0 = no limit on the whole transfer; only the connect phase is bounded below
        // Forged Baidu spider IP
        CURLOPT_HTTPHEADER     => array(
            'X-FORWARDED-FOR:' . $ip,
            'CLIENT-IP:' . $ip,
        ),
        // Forged Baidu spider user agent
        CURLOPT_USERAGENT      => "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        CURLOPT_RETURNTRANSFER => 1, // return the body from curl_exec() instead of printing it
        CURLOPT_HEADER         => 0, // do not include response headers in the returned body
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_SSL_VERIFYPEER => false, // NOTE(review): peer certificate verification is switched off
    );
    foreach ($options as $option => $value) {
        curl_setopt($ch, $option, $value);
    }

    $content = curl_exec($ch);
    if ($content !== false) {
        return $content;
    }

    // Failure: print the error message. Error number 28 is cURL's timeout code.
    if (curl_errno($ch) == 28) {
        $error = '访问目标地址超时';
    } else {
        $error = curl_error($ch);
    }
    echo $error;

    return null;
}