php采集远程文章简单类

<?php
/**
 * Simple content-gathering (scraping) helper.
 *
 * Downloads remote pages, cuts a region out of a string between two marker
 * strings, and extracts the links found in an HTML fragment.
 *
 * @author Milkcy 
 * @copyright            (C) 2012-2015 TCCMS.COM
 * @lastmodify             2012-07-10 14:00
 */
class gather {

    /** @var string Not read or written by any method of this class. */
    public $pagestring = '';

    /** @var mixed Value of the global $db at construction time; not used by the methods below. */
    private $db;

    /**
     * Stores a reference to the global $db for later use.
     */
    public function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Downloads the document at $url and returns its body with surrounding
     * whitespace trimmed.
     *
     * cURL is used when the extension is loaded (redirects are followed, the
     * response headers are not included); otherwise file_get_contents() is used.
     *
     * @param string $url Address of the document to fetch.
     * @return string The trimmed body, or '' when the URL is empty or the
     *                download failed.
     */
    public function geturlfile($url) {
        $url = trim((string) $url);
        if ($url === '') {
            // Nothing to fetch; also avoids file_get_contents('') throwing on PHP 8.
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports signal failure with false; report that as an empty body.
        if (!is_string($content)) {
            return '';
        }
        return trim($content);
    }

    /**
     * Extracts the target and the text of every <a> element in $code.
     *
     * Only anchors whose content contains no ">" character (i.e. no nested
     * tags) are matched by the pattern.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => list of link texts, 'url' => list of href
     *               values); both lists are index-aligned and empty when
     *               nothing matched.
     */
    public function get_all_url($code) {
        $arr = array();
        // The optional quote around the href value is ["'] - the original
        // class ["|'] also accepted a literal pipe by mistake.
        preg_match_all('/<a.+?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/is', $code, $arr);
        return array(
            'name' => isset($arr[2]) ? $arr[2] : array(),
            'url' => isset($arr[1]) ? $arr[1] : array(),
        );
    }

    /**
     * Returns the part of $str that lies between $start and $end.
     *
     * The text is taken after the first occurrence of $start, up to the next
     * occurrence of either $start or $end. When $end does not occur, everything
     * up to the next $start (or the end of the string) is returned.
     *
     * @param string $str   Text to cut from.
     * @param string $start Opening marker (trimmed before use).
     * @param string $end   Closing marker (trimmed before use).
     * @return string $str unchanged when either marker is empty; '' when
     *                $start does not occur in $str; otherwise the extracted text.
     */
    public function get_sub_content($str, $start, $end) {
        $start = trim($start);
        $end = trim($end);
        if ($start == '' || $end == '') {
            return $str;
        }

        $pieces = explode($start, $str);
        if (!isset($pieces[1])) {
            // Start marker not present: there is no region to return.
            return '';
        }

        $pieces = explode($end, $pieces[1]);
        return $pieces[0];
    }

    /**
     * Debug helper: prints var_dump($var) inside a styled <div>/<pre> wrapper.
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var) {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }

}

?>

<?php
// Demo: fetch a news list page, follow its first article link and dump the
// article body.
if (!defined('ROOT_PATH')) {
    define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
}
// Load the class file only when the class has not been declared yet, so the
// script works both on its own and when the class lives in the same file.
if (!class_exists('gather', false)) {
    require_once ROOT_PATH."/gather.class.php";
}
set_time_limit(0);
header("Content-type: text/html; charset=gb2312");
// Target list page
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';
// Create the gatherer
$gather = new gather();
// Download the list page HTML
$html = $gather->geturlfile($url);
// Markers delimiting the article list
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';
// Collect the article URLs and titles found inside that region
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);
// Uncomment to inspect the collected links:
//$gather->vd($newsAry);
if (isset($newsAry['url'][0])) {
    $tarGetUrl = $newsAry['url'][0];
    // Download the first article page
    $html = $gather->geturlfile($tarGetUrl);
    // Markers delimiting the article body
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';
    // Cut the article body out of the page
    $code = $gather->get_sub_content($html, $start, $end);
    // Embedded ad frame and footer icon that should not appear in the result
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
    $code = str_replace($killHtml, "", $code);
    $code = str_replace($killHtml2, "", $code);
    $gather->vd($code);
} else {
    // The list page could not be fetched or its layout changed; there is
    // no article link to follow.
    echo 'No article links found on the list page.';
}
?>
//该片段来自于http://outofmemory.cn

php 文章采集正则代码

/**
 * Downloads the document at $url with cURL and returns its trimmed body.
 *
 * Redirects are followed. Only the connection phase is time-limited.
 *
 * @param string $url     Address of the document to fetch.
 * @param int    $timeout Connection timeout in seconds (default 10).
 * @return string The trimmed body, or '' when the transfer failed.
 */
function getwebcontent($url, $timeout = 10){
    $ch = curl_init();
    if ($ch === false) {
        // cURL could not create a handle; report it like a failed download.
        return '';
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    $contents = curl_exec($ch);
    curl_close($ch);
    // curl_exec() returns false on failure; make the empty result explicit
    // instead of relying on trim() coercing false to ''.
    if (!is_string($contents)) {
        return '';
    }
    return trim($contents);
}


// Demo: collect article titles and URLs from a list page, download each
// article body and save it to "<title>.txt".
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Start from empty lists so the loops below are safe when nothing matches.
$article = array('title' => array(), 'link' => array(), 'content' => array());

// Match each <li> link: group 1 is the article path, group 2 the title.
// The groups are lazy so several links on one line are not merged into one.
$out = array();
preg_match_all('/<li><a href="\/learn\/article\/(.*?)">(.*?)<\/a>/', $string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.***.com/learn/article/".$match[1];
}

// Fetch every article page and keep the first "pagenum_0" block
foreach ($article['link'] as $key => $link) {
    $content_html = getwebcontent($link);
    if (preg_match('/<div id=pagenum_0(.*)>[\s\S]*?<\/div>/', $content_html, $matches)) {
        $article['content'][$key] = $matches[0];
    } else {
        // Page unavailable or layout changed: store an empty body.
        $article['content'][$key] = '';
    }
}

// Titles are transcoded to GBK because, per the original author, the files
// could not be saved under the UTF-8 names.
$fileNames = array();
$forbidden = array('/', '\\', ':', '*', '?', '"', '<', '>', '|');
foreach ($article['title'] as $key => $value) {
    // Titles come from a remote page: strip path separators and characters
    // that are invalid in file names. This is done on the UTF-8 text, because
    // in GBK some of these byte values occur inside multi-byte characters.
    $safe = trim(str_replace($forbidden, '_', $value));
    if ($safe === '') {
        $safe = 'article_'.$key;
    }

    $converted = iconv('utf-8', 'gbk', $value);
    if ($converted !== false) {
        // Keep the original title when it cannot be represented in GBK.
        $article['title'][$key] = $converted;
    }

    $safeConverted = iconv('utf-8', 'gbk', $safe);
    $fileNames[$key] = ($safeConverted !== false) ? $safeConverted : $safe;
}

// Write each article body to its own file
foreach ($fileNames as $key => $fileName) {
    file_put_contents("{$fileName}.txt", $article['content'][$key]);
}
?>