自己作的网站采集
1、要采集的标题和URL对象
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Text;
/// <summary>
/// One scraped hyperlink: the anchor text (title) together with the URL it points to.
/// </summary>
public class mylist
{
    // Recognises anchors whose href attribute ends with a single-quoted argument in
    // parentheses directly before the closing double quote, for example
    //     <a href="javascript:open('page.htm')">Title</a>
    // The quoted argument is captured as "url" and the anchor text as "mytitle".
    // Singleline lets '.' span line breaks, so an anchor may be split over several lines.
    // The pattern is built once and shared; it is only used for matching.
    private static readonly Regex LinkPattern = new Regex(
        @"<a(.*?)href(.*?)\(\'(?<url>[^<].*?)\'\)\"">(?<mytitle>[^<].*?)<\/a>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

    private string _url;

    /// <summary>
    /// The link target captured from the anchor's href.
    /// </summary>
    public string Url
    {
        get { return _url; }
        set { _url = value; }
    }

    private string _mytitle;

    /// <summary>
    /// The text found between the opening and closing anchor tags.
    /// </summary>
    public string Mytitle
    {
        get { return _mytitle; }
        set { _mytitle = value; }
    }

    /// <summary>
    /// Creates a link entry from an already extracted title and URL.
    /// </summary>
    /// <param name="mytitle">Anchor text.</param>
    /// <param name="url">Link target.</param>
    public mylist(string mytitle, string url)
    {
        this._mytitle = mytitle;
        this._url = url;
    }

    /// <summary>
    /// Extracts every title/URL pair from a piece of HTML.
    /// </summary>
    /// <param name="input">HTML text to scan; may be null or empty.</param>
    /// <returns>
    /// The matched links in document order. The list is empty (never null) when
    /// <paramref name="input"/> is null, empty, or contains no matching anchor.
    /// </returns>
    public static List<mylist> GetAllList(string input)
    {
        List<mylist> allList = new List<mylist>();

        // Regex.Matches rejects null, so treat "nothing to scan" as "nothing found".
        if (string.IsNullOrEmpty(input))
        {
            return allList;
        }

        foreach (Match match in LinkPattern.Matches(input))
        {
            allList.Add(new mylist(match.Groups["mytitle"].Value, match.Groups["url"].Value));
        }

        return allList;
    }
}
2、Tools对象
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.IO;
using System.Text;
using System.Collections;
using System.Net;
using System.Xml;
/// <summary>
/// Helper routines for the scraper: reading local and remote HTML, stripping
/// markup, and writing a sample XML file.
/// </summary>
public class Tools
{
    // Matches a complete <style ...>...</style> element in any letter case, including
    // content that spans several lines.
    private static readonly Regex StyleBlockPattern = new Regex(
        @"<style\b[^>]*>.*?</style\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Matches every tag except those whose name starts with a lowercase 'p'
    // (opening or closing). Note that this keeps not only <p> but also tags such
    // as <pre> or <param>, while an uppercase <P> is removed.
    private static readonly Regex NonParagraphTagPattern = new Regex(@"<(?!/?p)[^>]*>");

    /// <summary>
    /// Parameterless constructor; every member of this class is static.
    /// </summary>
    public Tools()
    {
    }

    /// <summary>
    /// Reads the whole content of a local file of the web application.
    /// </summary>
    /// <param name="input">Virtual path of the file, resolved with Server.MapPath.</param>
    /// <returns>The file content decoded as gb2312.</returns>
    public static string GetAllHtml(string input)
    {
        string filePath = HttpContext.Current.Server.MapPath(input);
        Encoding encoding = Encoding.GetEncoding("gb2312");

        // 'using' closes the reader on every path and, unlike a Close() call in a
        // finally block, cannot fail with a NullReferenceException when the file
        // could not be opened. Exceptions propagate with their original stack trace.
        using (StreamReader reader = new StreamReader(filePath, encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the full source of a remote page.
    /// </summary>
    /// <param name="URL">Absolute address of the page.</param>
    /// <returns>The response body decoded as GB2312.</returns>
    public static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        // Request settings only take effect when applied before the request is sent.
        request.Method = "GET";
        request.KeepAlive = false;

        // Dispose response, stream and reader so the connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Strips style sheets and markup from a piece of HTML.
    /// </summary>
    /// <param name="obj">Value whose string form is the HTML to clean; may be null.</param>
    /// <returns>
    /// An empty string for null; otherwise the text with every complete
    /// &lt;style&gt; element removed (tags and content, any letter case) and all
    /// tags removed except those whose name starts with a lowercase 'p'.
    /// </returns>
    public static string HTMLFilter(Object obj)
    {
        if (obj == null)
        {
            return "";
        }

        // Remove the CSS first so that it does not survive as plain text once the
        // surrounding tags are stripped below.
        string html = StyleBlockPattern.Replace(obj.ToString(), "");
        return NonParagraphTagPattern.Replace(html, "");
    }

    /*
    Disabled draft of a script filter. It does not compile as written: the type
    names are lowercased and the character classes read [\s\s].
    public string wipescript(string html)
    {
    system.text.regularexpressions.regex regex1 = new system.text.regularexpressions.regex(@"<script[\s\s]+</script *>", system.text.regularexpressions.regexoptions.ignorecase);
    system.text.regularexpressions.regex regex2 = new system.text.regularexpressions.regex(@" href *= *[\s\s]*script *:", system.text.regularexpressions.regexoptions.ignorecase);
    system.text.regularexpressions.regex regex3 = new system.text.regularexpressions.regex(@" on[\s\s]*=", system.text.regularexpressions.regexoptions.ignorecase);
    system.text.regularexpressions.regex regex4 = new system.text.regularexpressions.regex(@"<iframe[\s\s]+</iframe *>", system.text.regularexpressions.regexoptions.ignorecase);
    system.text.regularexpressions.regex regex5 = new system.text.regularexpressions.regex(@"<frameset[\s\s]+</frameset *>", system.text.regularexpressions.regexoptions.ignorecase);
    html = regex1.replace(html, ""); // filter <script></script> elements
    html = regex2.replace(html, ""); // filter href=javascript: attributes of <a>
    html = regex3.replace(html, " _disibledevent="); // filter on... event attributes of other controls
    html = regex4.replace(html, ""); // filter iframe
    html = regex5.replace(html, ""); // filter frameset
    return html;
    }
    * */

    /// <summary>
    /// Writes a small sample XML document (a SpotList with one Items entry) to a file.
    /// </summary>
    /// <param name="XmlFileName">Path of the file to create or overwrite.</param>
    /// <returns>
    /// true when the document was written; false when writing the content failed.
    /// A failure to open the file is not caught and propagates to the caller.
    /// </returns>
    public static bool CreateXml(string XmlFileName)
    {
        // The file is encoded as UTF-8, so the declaration reads
        // <?xml version="1.0" encoding="utf-8"?>.
        using (XmlTextWriter writer = new XmlTextWriter(XmlFileName, Encoding.UTF8))
        {
            try
            {
                // Indent nested elements so the file is readable.
                writer.Formatting = Formatting.Indented;

                writer.WriteStartDocument();
                writer.WriteProcessingInstruction("xml-stylesheet", "type=\"text/css\" href=\"xml.css\"");

                // Root element with the two XML Schema namespace declarations:
                // <SpotList xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                //           xmlns:xsd="http://www.w3.org/2001/XMLSchema">
                writer.WriteStartElement("SpotList");
                writer.WriteAttributeString("xmlns", "xsi", null, "http://www.w3.org/2001/XMLSchema-instance");
                writer.WriteAttributeString("xmlns", "xsd", null, "http://www.w3.org/2001/XMLSchema");

                writer.WriteStartElement("Items");
                writer.WriteElementString("Name", "aa");

                // Produces <Intro><![CDATA[ckata]]></Intro>.
                writer.WriteStartElement("Intro");
                writer.WriteCData("ckata");
                writer.WriteEndElement(); // Intro

                writer.WriteEndElement(); // Items
                writer.WriteEndElement(); // SpotList
                writer.WriteEndDocument();
                return true;
            }
            catch (Exception)
            {
                // Best effort by design: callers only learn that writing failed.
                return false;
            }
        }
    }
}
3、显示所采集到的标题和URL
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
/// <summary>
/// Test page that prints every title/URL pair found in the local file "list.htm".
/// </summary>
public partial class ceshilist : System.Web.UI.Page
{
    /// <summary>
    /// Loads "list.htm", extracts its links and writes one title line and one URL
    /// line per link to the response.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        string html = Tools.GetAllHtml("list.htm");
        List<mylist> links = mylist.GetAllList(html);

        // NOTE(review): titles and URLs are written exactly as extracted, without
        // HTML encoding - confirm this is acceptable for the scraped content.
        for (int i = 0; i < links.Count; i++)
        {
            mylist link = links[i];
            Response.Write("我是标题:" + link.Mytitle);
            Response.Write("<br/>");
            Response.Write("我是网址:" + link.Url);
            Response.Write("<br/>");
        }
    }
}