<?php
define('PRE_DOMAIN','www');
define('DOMAIN','sina.com.cn');
define('PROTOCOL','https');
define('ROOT',PROTOCOL.'://'.PRE_DOMAIN.'.'.DOMAIN.'/');
foreach (spider() as $key => $value) {
echo $value."\r\n";
}
function spider(){
$headers=array(
'user-agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
);
$oUrls=parseURL(get(ROOT,$headers));
$result=array();
$queue=array();
foreach($oUrls as $u){
$result[$u]=true;
array_push($queue,$u);
while(!empty($queue)){
$v=array_pop($queue);
$temp=parseURL(get($v,$headers));
foreach($temp as $j){
if(!isset($result[$j])){
yield $j;
$result[$j]=true;
array_push($queue,$j);
}
}
}
}
}
function get($url,$header=null){
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
if (!empty($header)){
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
}
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 3);
curl_setopt($curl, CURLOPT_TIMEOUT, 10);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$output = curl_exec($curl);
$h = curl_getinfo($curl);
if(!empty($h) && $h['http_code']==200 && stripos($h['content_type'],'text/html')===false){
return "";
}
curl_close($curl);
return $output;
}
function parseURL($content){
preg_match_all('/<a.*href=["\']([^"\'>]*)["\'].*>/', $content,$matchs);
if(empty($matchs[1])) return array();
$match=$matchs[1];
foreach ($match as $key => $value) {
$flag=false;
if(stripos($value, 'http')!==false && stripos($value,DOMAIN)===false){
$flag=true;
}
if(stripos($value, '//')===0 && stripos($value,DOMAIN)!==false){
$match[$key]='https:'.$value;
continue;
}
if(stripos($value, '//')===0 && stripos($value,DOMAIN)===false){
$flag=true;
}
if(stripos($value, 'javascript')===0||stripos($value, '#')===0){
$flag=true;
}
if($flag){
unset($match[$key]);
continue;
}
if(stripos($value,DOMAIN)!==false){
continue;
}
$match[$key]=ROOT.trim($value,'/');
}
return $match;
}