PHP采集系统
浏览量:666 | 分类:PHP | 发布日期:2009-08-13
今天公司PHP牛人教了PHP采集系统的原理^_^,太牛了!
代码如下
-
<?php
// 获得网页内容 (fetch the content of a web page)
/**
 * Fetch a URL over a raw socket using a plain HTTP/1.0 GET request.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array 'state' is one of "NOHOST", "Cannot send request", "timeout",
 *               "jump" or "ok". 'file' (the response body) is only set for
 *               the "jump" and "ok" states.
 */
function getFileContents($url) {
    // Header value only; the "User-Agent:" name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $fsocket_timeout = 60;
    $contents = array();

    $urlparts = parse_url($url);
    if (!is_array($urlparts) || empty($urlparts['host'])) {
        // Without a host there is nothing to connect to.
        $contents['state'] = "NOHOST";
        print "Error: no host in URL";
        return $contents;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";
    $is_https = ($scheme == "https");

    // Fall back to the scheme's default port so $port is always defined.
    $default_port = $is_https ? 443 : 80;
    $port = isset($urlparts['port']) ? (int) $urlparts['port'] : $default_port;
    $portq = ($port == $default_port) ? "" : ":$port";

    // A URL such as "http://example.com" has no path component; request "/".
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?".$urlparts['query'];
    }

    $request = "GET $path HTTP/1.0\r\n"
        ."Host: $host$portq\r\n"
        ."Accept: */*\r\n"
        ."Accept-Encoding: identity\r\n"
        ."User-Agent: $user_agent\r\n"
        ."\r\n";

    $target = $is_https ? "ssl://".$host : $host;

    $errno = 0;
    $errstr = "";
    // Warnings are suppressed because the failure is reported through 'state'.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fputs($fp, $request)) {
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    stream_set_timeout($fp, $fsocket_timeout);

    $data = "";
    $timed_out = false;
    while (!feof($fp)) {
        $chunk = fgets($fp, 8192);
        // The timeout flag must be re-read after every read; a stalled
        // connection never reaches EOF and would otherwise loop forever.
        $meta = stream_get_meta_data($fp);
        if ($meta['timed_out']) {
            $timed_out = true;
            break;
        }
        if ($chunk === false) {
            break;
        }
        $data .= $chunk;
    }
    fclose($fp);

    if ($timed_out) {
        $contents['state'] = "timeout";
        return $contents;
    }

    // Headers and body are separated by the first empty line.
    $split = strpos($data, "\r\n\r\n");
    if ($split === false) {
        $headers = $data;
        $body = "";
    } else {
        $headers = substr($data, 0, $split);
        $body = (string) substr($data, $split + 4);
    }

    // A redirect is recognised by a Location header combined with
    // "Cache-Control: private"; only the header section is inspected.
    if (strpos($headers, "Location: ") !== false && strpos($headers, "Cache-Control: private") !== false) {
        $contents['state'] = "jump";
    } else {
        $contents['state'] = "ok";
    }
    $contents['file'] = $body;

    return $contents;
}
/*
 * 检查url文件是否可以读取
 * Check whether the file at the URL is available and in a readable form.
 */
/**
 * Probe a URL with an HTTP HEAD request and classify the response.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state': "ok", "NOHOST", "Cannot send request",
 *               "Unreachable: http NNN", "Relocation: http NNN",
 *               "Timed out (no reply from server)" or "Not text or html".
 *               Optional keys: 'content' ("text", "pdf" or "doc") when the
 *               state is "ok", 'path' (redirect target) for relocations, and
 *               'date' (Last-Modified value) when that header was seen before
 *               Content-Type.
 */
function url_status($url) {
    // Header value only; the "User-Agent:" name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Windows 2000; Windows XP)";
    $fsocket_timeout = 60;
    $status = array();

    $urlparts = parse_url($url);
    if (!is_array($urlparts) || empty($urlparts['host'])) {
        $status['state'] = "NOHOST";
        return $status;
    }

    $host = $urlparts['host'];
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";
    $is_https = ($scheme == "https");

    // Fall back to the scheme's default port so $port is always defined.
    $default_port = $is_https ? 443 : 80;
    $port = isset($urlparts['port']) ? (int) $urlparts['port'] : $default_port;
    $portq = ($port == $default_port) ? "" : ":$port";

    // A URL such as "http://example.com" has no path component; request "/".
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] !== "") {
        $path .= "?".$urlparts['query'];
    }

    // "Connection: close" makes the server end the HTTP/1.1 connection once
    // the headers are sent, so the reader below cannot stall on keep-alive.
    $request = "HEAD $path HTTP/1.1\r\n"
        ."Host: $host$portq\r\n"
        ."Accept: */*\r\n"
        ."Accept-Charset: iso-8859-1\r\n"
        ."Accept-Encoding: identity\r\n"
        ."User-Agent: $user_agent\r\n"
        ."Connection: close\r\n"
        ."\r\n";

    $target = $is_https ? "ssl://".$host : $host;

    $errno = 0;
    $errstr = "";
    // Warnings are suppressed because the failure is reported through 'state'.
    $fp = @fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $status['state'] = "NOHOST";
        return $status;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    if (!fputs($fp, $request)) {
        fclose($fp);
        $status['state'] = "Cannot send request";
        return $status;
    }

    // Status line: capture the full three-digit code and its class digit.
    $httpcode = 0;
    $full_httpcode = "";
    $regs = array();
    $answer = fgets($fp, 4096);
    if ($answer !== false && preg_match('/HTTP\/[0-9.]+ (([0-9])[0-9]{2})/', $answer, $regs)) {
        $httpcode = (int) $regs[2];
        $full_httpcode = $regs[1];

        if ($httpcode != 2 && $httpcode != 3) {
            fclose($fp);
            $status['state'] = "Unreachable: http $full_httpcode";
            return $status;
        }
    }

    // Walk the headers until Content-Type, the blank line that ends the
    // header section, or EOF/timeout.
    $content = "";
    while (($answer = fgets($fp, 4096)) !== false) {
        if (trim($answer) === "") {
            break;
        }

        // Any 3xx other than 302 is reported as a relocation; a 302 falls
        // through and is classified by its Content-Type like a 2xx.
        if ($httpcode == 3 && $full_httpcode != "302"
            && preg_match('/^Location: *([^\r\n ]+)/i', $answer, $regs)) {
            $status['path'] = $regs[1];
            $status['state'] = "Relocation: http $full_httpcode";
            fclose($fp);
            return $status;
        }

        if (preg_match('/^Last-Modified: *([a-z0-9,: ]+)/i', $answer, $regs)) {
            $status['date'] = $regs[1];
        }

        if (preg_match('/^Content-Type:/i', $answer)) {
            $content = $answer;
            break;
        }
    }

    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    if (preg_match('/Content-Type: *([a-z\/]*)/i', $content, $regs)) {
        $type = strtolower($regs[1]);
        if ($type == 'text/html' || $type == 'text/' || $type == 'text/plain') {
            $status['content'] = 'text';
            $status['state'] = 'ok';
        } else if ($type == 'application/pdf') {
            $status['content'] = 'pdf';
            $status['state'] = 'ok';
        } else if ($type == 'application/msword') {
            $status['content'] = 'doc';
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } else if (!empty($socket_status['timed_out'])) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }

    return $status;
}
// Crawl configuration: the site root plus the literal HTML markers used to
// cut list items, detail links and detail-page fields out of the markup.
$host = 'http://www.admin5.com';
$list_exp = '<div class="itembox"';
$url_start = '<a href="';
$url_end = '" target=';
$detail_title_start = '<h1>';
$detail_title_end = '</h1>';
$detail_summary_start = '<div id="arctext">';
// NOTE(review): identical to $detail_summary_start, so it cannot mark where
// the summary ends - confirm the intended closing marker.
$detail_summary_end = '<div id="arctext">';

$max_page = 179;

// Walk the list pages from the last one down to page 1.
for ($page = $max_page; $page > 0; $page--) {
    $url = $host."/browse/26/list_".$page.".shtml";

    // Only fetch pages that the HEAD probe reports as readable text; the
    // 'content' key is absent when the probe fails.
    $status = url_status($url);
    if (!isset($status['state'], $status['content'])
        || $status['content'] != 'text' || $status['state'] != 'ok') {
        continue;
    }

    // 'file' is only present when a response body was actually received.
    $files = getFileContents($url);
    if (!isset($files['file'])) {
        continue;
    }

    // Element 0 is the markup before the first list item, so start at 1.
    $arr = explode($list_exp, $files['file']);
    $item_count = count($arr);
    for ($i = 1; $i < $item_count; $i++) {
        $start = strpos($arr[$i], $url_start);
        if ($start === false) {
            continue; // no link in this list item
        }
        $detail_url = substr($arr[$i], $start + strlen($url_start));

        $pos = strpos($detail_url, $url_end);
        if ($pos === false) {
            continue; // link is not terminated by the expected marker
        }
        $detail_url = $host.substr($detail_url, 0, $pos);

        $summary = getFileContents($detail_url);

        // Demonstration only: dump the first detail page and stop.
        print_r($summary);
        exit;
    }
}
?>
上一篇: 仙剑三之紫萱与长卿忘情湖分别
下一篇: 心情