curl模拟ip和来源进行网站采集的实现方法,附模拟百度抓取

对于限制了ip和来源的网站，使用普通的采集方式是行不通的。这里介绍一种方法：使用php的curl扩展模拟ip和来源，从而采集限制了ip和来源的网站。

1.设置页面限制ip和来源访问
比如服务端的server.php


<?php
 
// Only one client address, arriving from one referring site, is let through.
$allow_ip = '192.168.0.100';
$allow_referer = 'http://www.xxx.cn';

$client_ip = getip();
$referer = getreferer();

// Reject as soon as either check fails; the referer must START with the
// allowed site URL (strpos position 0), not merely contain it.
if($client_ip!=$allow_ip || strpos($referer, $allow_referer)!==0){
  echo 'deny access';
}else{
  echo 'allow access';
}
 
/**
 * Get the visitor's IP address.
 *
 * Sources are probed in this order: the Client-IP request header, the
 * X-Forwarded-For request header, then REMOTE_ADDR. The first two are taken
 * from request headers, so the sender of the request controls their value.
 *
 * A proxied X-Forwarded-For value is a comma-separated chain
 * ("client, proxy1, proxy2"); only the first entry is returned, with
 * surrounding whitespace removed, so the result can be compared against a
 * single address.
 *
 * @return string The detected address, or '' when no source provides one.
 */
function getip(){
  $sources = array('HTTP_CLIENT_IP', 'HTTP_X_FORWARDED_FOR', 'REMOTE_ADDR');

  foreach($sources as $key){
    if(empty($_SERVER[$key])){
      continue;
    }

    // Keep only the first address of a possible "a, b, c" chain.
    $parts = explode(',', $_SERVER[$key]);
    $cip = trim($parts[0]);

    // A value that is blank after trimming is treated as absent.
    if($cip !== ''){
      return $cip;
    }
  }

  return '';
}
 
/**
 * Get the referer reported by the visitor.
 *
 * @return string The value of $_SERVER['HTTP_REFERER'], or '' when it is not set.
 */
function getreferer(){
  return isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : '';
}
 
?>

使用curl正常访问

<?php
/**
 * Send a POST request with cURL and return the response body.
 *
 * If cURL reports an error message after the transfer, the script is
 * terminated with that message via die().
 *
 * @param string $url     Address to request.
 * @param array  $data    POST fields; encoded with http_build_query().
 * @param array  $header  Request header lines, e.g. array('Name: value').
 * @param int    $timeout Value passed to CURLOPT_TIMEOUT (seconds).
 * @return mixed The result of curl_exec(); CURLOPT_RETURNTRANSFER is enabled,
 *               so this is the response body rather than direct output.
 */
function doCurl($url, $data=array(), $header=array(), $timeout=30){

  // Options are listed in the order they are applied.
  $options = array(
    CURLOPT_URL            => $url,
    CURLOPT_HTTPHEADER     => $header,
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => http_build_query($data),
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT        => $timeout,
  );

  $handle = curl_init();
  curl_setopt_array($handle, $options);

  $body = curl_exec($handle);

  // A non-empty error message means the transfer failed: abort the script.
  $message = curl_error($handle);
  if($message){
    die($message);
  }

  curl_close($handle);

  return $body;

}
 
// Call: a plain request that sends no spoofed IP headers and no referer.
$url = 'http://www.xxx.cn/server.php';

// Keep the body in $response and print it in one step.
echo $response = doCurl($url);
?>

使用curl模拟ip和来源进行访问

模拟来源

// Set the Referer header sent with the request; replace the placeholder string with the referer URL to present.
curl_setopt($ch, CURLOPT_REFERER, '来源');

模拟ip

// Send CLIENT-IP and X-FORWARDED-FOR request headers; replace the placeholder in each with the IP address to present.
curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP: 模拟ip','X-FORWARDED-FOR: 模拟ip'));

完整代码如下：

<?php
/**
 * Send a POST request with cURL, presenting a caller-chosen referer, and
 * return the response body.
 *
 * If cURL reports an error message after the transfer, the script is
 * terminated with that message via die().
 *
 * @param string $url     Address to request.
 * @param array  $data    POST fields; encoded with http_build_query().
 * @param array  $header  Request header lines, e.g. array('Name: value').
 * @param string $referer Value passed to CURLOPT_REFERER.
 * @param int    $timeout Value passed to CURLOPT_TIMEOUT (seconds).
 * @return mixed The result of curl_exec(); CURLOPT_RETURNTRANSFER is enabled,
 *               so this is the response body rather than direct output.
 */
function doCurl($url, $data=array(), $header=array(), $referer='', $timeout=30){

  // Options are listed in the order they are applied.
  $options = array(
    CURLOPT_URL            => $url,
    CURLOPT_HTTPHEADER     => $header,
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => http_build_query($data),
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_TIMEOUT        => $timeout,
    // Spoofed referer
    CURLOPT_REFERER        => $referer,
  );

  $handle = curl_init();
  curl_setopt_array($handle, $options);

  $body = curl_exec($handle);

  // A non-empty error message means the transfer failed: abort the script.
  $message = curl_error($handle);
  if($message){
    die($message);
  }

  curl_close($handle);

  return $body;

}
 
// Call: request the page on the external site being scraped.
$url = 'http://www.example.cn/server.php';
$data = array();

// Referer to present
$referer = 'http://www.xxx.cn/';

// IP to present, sent in both the CLIENT-IP and X-FORWARDED-FOR headers
$header = array(
  'CLIENT-IP: 192.168.0.100',
  'X-FORWARDED-FOR: 192.168.0.100'
);

// 5-second timeout; keep the body in $response and print it in one step.
echo $response = doCurl($url, $data, $header, $referer, 5);
?>

附录：模拟百度抓取