<?php
/**
 * Page-gathering (scraping) helper.
 *
 * Downloads remote pages and cuts link lists / content regions out of the
 * returned HTML with plain string and regular-expression operations.
 *
 * @author Milkcy
 * @copyright (C) 2012-2015 TCCMS.COM
 * @lastmodify 2012-07-10 14:00
 */
class gather
{
    public $pagestring = '';

    /** Whatever the global $db held when the object was created (may be null). */
    private $db;

    /**
     * Stores the global $db in the instance.
     */
    public function __construct()
    {
        global $db;
        $this->db = $db;
    }

    /**
     * Download the body of a URL.
     *
     * Uses cURL (following redirects) when the extension is loaded and falls
     * back to file_get_contents() otherwise.
     *
     * @param string $url     Address to fetch; surrounding whitespace is ignored.
     * @param int    $timeout Connect timeout in seconds for the cURL branch.
     *
     * @return string Trimmed response body, or '' when the URL is empty or the
     *                download failed.
     */
    public function geturlfile($url, $timeout = 10)
    {
        $url = trim((string) $url);
        if ($url === '') {
            // Nothing to request; avoids handing an empty path to cURL / file_get_contents().
            return '';
        }

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            // Without a connect timeout an unreachable host can stall the whole run.
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, (int) $timeout);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports return false on failure; normalise that to ''.
        return is_string($content) ? trim($content) : '';
    }

    /**
     * Extract every anchor from an HTML fragment.
     *
     * @param string $code HTML to scan.
     *
     * @return array array('name' => link texts, 'url' => href values); both
     *               lists are index-aligned and empty when nothing matched.
     */
    public function get_all_url($code)
    {
        $pattern = '/<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/is';

        if (!preg_match_all($pattern, (string) $code, $arr)) {
            // No match, or a PCRE error (e.g. backtrack limit hit).
            return array('name' => array(), 'url' => array());
        }

        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Return the text that sits between two markers.
     *
     * @param string $str   Text to cut from.
     * @param string $start Marker that opens the region.
     * @param string $end   Marker that closes the region.
     *
     * @return string $str unchanged when either marker is empty; '' when
     *                $start does not occur in $str; otherwise the text after
     *                the first $start up to the next $start or $end.
     */
    public function get_sub_content($str, $start, $end)
    {
        $start = trim((string) $start);
        $end = trim((string) $end);
        if ($start === '' || $end === '') {
            return $str;
        }

        $pieces = explode($start, (string) $str);
        if (!isset($pieces[1])) {
            // Start marker absent: there is no region to return.
            return '';
        }

        $tail = explode($end, $pieces[1]);

        return $tail[0];
    }

    /**
     * Debug helper: echo var_dump() of a value inside a styled <div>/<pre>.
     *
     * @param mixed $var Value to dump.
     */
    public function vd($var)
    {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }
}

// ---------------------------------------------------------------------------
// Demo: fetch a news listing page, follow the first article link and dump the
// article body.
// Kept in the same PHP block as the class so that no output is sent before
// header() is called.
// ---------------------------------------------------------------------------

if (!defined('ROOT_PATH')) {
    define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
}

// Load the class file only when the class is not already defined, otherwise
// the include would redeclare it.
if (!class_exists('gather', false)) {
    include ROOT_PATH . "/gather.class.php";
}

set_time_limit(0);
if (!headers_sent()) {
    header("Content-type: text/html; charset=gb2312");
}

// Target listing URL.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';

// Create the gatherer.
$gather = new gather();

// Download the listing page HTML.
$html = $gather->geturlfile($url);

// Markers delimiting the article list inside the listing page.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';

// Collect article URLs and titles found inside that region.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Print the result.
//$gather->vd($newsAry);

if (empty($newsAry['url'])) {
    // Listing could not be fetched or contained no links: nothing to follow.
    $code = '';
} else {
    $tarGetUrl = $newsAry['url'][0];

    // Download the first article page.
    $html = $gather->geturlfile($tarGetUrl);

    // Markers delimiting the article body inside the article page.
    $start = '<div id="endText">';
    $end = '<span class="cDGray right" style="white-space:nowrap;">';

    // Cut the article body out of the page.
    $code = $gather->get_sub_content($html, $start, $end);

    // Fixed fragments (ad iframe, site icon link) removed from the body.
    $killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
    $killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
    $code = str_replace($killHtml, "", $code);
    $code = str_replace($killHtml2, "", $code);
}

$gather->vd($code);

// This snippet comes from http://outofmemory.cn
?>
php 文章采集正则代码
<?php
/**
 * Fetch the HTML of a page.
 *
 * Uses cURL with a 10 second connect timeout and follows redirects; falls
 * back to file_get_contents() when the cURL extension is not loaded.
 *
 * @param string $url Address to download.
 *
 * @return string Trimmed response body, or '' when the URL is empty or the
 *                request failed.
 */
function getwebcontent($url)
{
    $url = trim((string) $url);
    if ($url === '') {
        return '';
    }

    if (!extension_loaded('curl')) {
        $contents = file_get_contents($url);

        return is_string($contents) ? trim($contents) : '';
    }

    $ch = curl_init();
    $timeout = 10;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    $contents = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; normalise that to ''.
    return is_string($contents) ? trim($contents) : '';
}

// Collected data; the three lists share the same indexes.
$article = array('title' => array(), 'link' => array(), 'content' => array());

// Download the listing page that holds the titles and URLs.
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Match each <li> link to get its title and address. The captures are lazy so
// that several links on one line are not swallowed into a single match.
$out = array();
if (preg_match_all('/<li><a href="\/learn\/article\/(.*?)">(.*?)<\/a>/', $string, $out, PREG_SET_ORDER)) {
    foreach ($out as $match) {
        $article['title'][] = $match[2];
        $article['link'][] = "http://www.***.com/learn/article/" . $match[1];
    }
}

// Download every article and keep the first "pagenum_0" <div> of each page.
foreach ($article['link'] as $key => $link) {
    $content_html = getwebcontent($link);
    $matches = array();
    if (preg_match('/<div id=pagenum_0(.*)>[\s|\S]*?<\/div>/', $content_html, $matches)) {
        $article['content'][$key] = $matches[0];
    } else {
        // Page missing or laid out differently: store an empty body.
        $article['content'][$key] = '';
    }
}

// Save each article to "<title>.txt".
foreach ($article['title'] as $key => $title) {
    // Titles come from a remote page: replace path separators, characters
    // that are reserved in file names and control bytes. This is done on the
    // UTF-8 text because GBK trail bytes can coincide with '\' or '|'.
    $safeTitle = trim(preg_replace('#[\\\\/:*?"<>|\x00-\x1F]+#', '_', $title));

    // The original author notes the file cannot be saved without this
    // UTF-8 -> GBK conversion of the name.
    $fileName = ($safeTitle === '') ? false : iconv('utf-8', 'gbk', $safeTitle);
    if ($fileName === false || $fileName === '') {
        // Conversion failed or nothing usable was left: use a generated name.
        $fileName = 'article_' . $key;
    }

    $body = isset($article['content'][$key]) ? $article['content'][$key] : '';
    file_put_contents($fileName . '.txt', $body);
}
?>