Tom-Net

自己做的网站采集

1. 要采集的标题和 URL 对象

 

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Text;
/// <summary>
/// One scraped hyperlink: the anchor's display title together with the URL it points to.
/// </summary>
public class mylist
{
    // Matches anchors whose href wraps the target in a quoted call, i.e. text shaped like
    //     <a ... href ... ('URL')">TITLE</a>
    // The named groups "url" and "mytitle" capture URL and TITLE respectively.
    // The pattern was reconstructed from the author's original single-string version;
    // the split StringBuilder form had been corrupted and no longer compiled.
    // Built once and reused, because Regex instances are immutable and thread-safe.
    private static readonly Regex LinkRegex = new Regex(
        "<a(.*?)href(.*?)\\('(?<url>[^<].*?)'\\)\">(?<mytitle>[^<].*?)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

    private string _url;

    /// <summary>
    /// The link target captured from the anchor.
    /// </summary>
    public string Url
    {
        get { return _url; }
        set { _url = value; }
    }

    private string _mytitle;

    /// <summary>
    /// The anchor's inner text (everything between the opening tag and the closing tag).
    /// </summary>
    public string Mytitle
    {
        get { return _mytitle; }
        set { _mytitle = value; }
    }

    /// <summary>
    /// Creates a link entry from a title and a URL.
    /// </summary>
    /// <param name="mytitle">Display title of the link.</param>
    /// <param name="url">Target URL of the link.</param>
    public mylist(string mytitle, string url)
    {
        this._mytitle = mytitle;
        this._url = url;
    }

    /// <summary>
    /// Extracts every title/URL pair from a chunk of HTML.
    /// </summary>
    /// <param name="input">HTML text to scan; may be null or empty.</param>
    /// <returns>
    /// One entry per matching anchor, in document order. Never null; empty when
    /// <paramref name="input"/> is null, empty, or contains no matching anchor.
    /// </returns>
    public static List<mylist> GetAllList(string input)
    {
        List<mylist> allList = new List<mylist>();

        // Regex.Matches throws on null, so treat "nothing to scan" as "nothing found".
        if (string.IsNullOrEmpty(input))
        {
            return allList;
        }

        foreach (Match match in LinkRegex.Matches(input))
        {
            allList.Add(new mylist(match.Groups["mytitle"].Value, match.Groups["url"].Value));
        }

        return allList;
    }
}

2. Tools 对象

 

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.IO;
using System.Text;
using System.Collections;
using System.Net;
using System.Xml;
/// <summary>
/// Helper routines for the scraper: reading local and remote HTML, stripping markup,
/// and writing a sample XML file.
/// </summary>
public class Tools
{
    // A complete style element, in any letter case, including its content.
    private static readonly Regex StyleBlockRegex = new Regex(
        @"<style\b[^>]*>.*?</style\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Any tag except those whose name starts with a lowercase "p" (optionally preceded by "/").
    // Note that this keeps <p> and </p>, but also other p-prefixed tags such as <pre>.
    private static readonly Regex NonParagraphTagRegex = new Regex(@"<(?!/?p)[^>]*>");

    /// <summary>
    /// Creates an instance. All members are static; the constructor is kept for existing callers.
    /// </summary>
    public Tools()
    {
    }

    /// <summary>
    /// Reads the whole content of a local HTML file, decoding it as GB2312.
    /// </summary>
    /// <param name="input">Virtual path of the file; resolved with Server.MapPath of the current request.</param>
    /// <returns>The complete file content.</returns>
    public static string GetAllHtml(string input)
    {
        string filePath = HttpContext.Current.Server.MapPath(input);
        Encoding encoding = Encoding.GetEncoding("gb2312");

        // "using" closes the reader on every path. If the file cannot be opened, the
        // original exception propagates unchanged with its stack trace intact.
        using (StreamReader reader = new StreamReader(filePath, encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads a remote page and returns its body decoded as GB2312.
    /// </summary>
    /// <param name="URL">Absolute URL of the page to fetch.</param>
    /// <returns>The response body as text.</returns>
    public static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set before GetResponse(), which is what sends the request.
        request.Method = "GET";
        request.KeepAlive = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Strips style blocks and markup from HTML, keeping paragraph tags.
    /// </summary>
    /// <param name="obj">Value whose ToString() result is the HTML to clean; may be null.</param>
    /// <returns>
    /// An empty string when <paramref name="obj"/> is null; otherwise the text with every
    /// complete style element (tags and content) removed and every remaining tag removed
    /// except those whose name begins with a lowercase "p".
    /// </returns>
    public static string HTMLFilter(Object obj)
    {
        if (obj == null)
        {
            return "";
        }

        // Remove style elements first so their CSS text does not survive tag stripping.
        string withoutStyles = StyleBlockRegex.Replace(obj.ToString(), "");
        return NonParagraphTagRegex.Replace(withoutStyles, "");
    }

    /// <summary>
    /// Writes a small sample "SpotList" XML document (UTF-8, indented) to the given file.
    /// </summary>
    /// <param name="XmlFileName">Path of the file to create or overwrite.</param>
    /// <returns>true when the document was written; false when writing the content failed.</returns>
    public static bool CreateXml(string XmlFileName)
    {
        // Encoding.UTF8 makes the declaration read <?xml version="1.0" encoding="utf-8"?>.
        using (XmlTextWriter xmlwriter = new XmlTextWriter(XmlFileName, Encoding.UTF8))
        {
            try
            {
                // Indent child elements by nesting level so the output is readable.
                xmlwriter.Formatting = Formatting.Indented;

                xmlwriter.WriteStartDocument();
                xmlwriter.WriteProcessingInstruction("xml-stylesheet", "type=\"text/css\" href=\"xml.css\"");

                // Root element with the standard XML Schema namespace declarations:
                // <SpotList xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                //           xmlns:xsd="http://www.w3.org/2001/XMLSchema">
                xmlwriter.WriteStartElement("SpotList");
                xmlwriter.WriteAttributeString("xmlns", "xsi", null, "http://www.w3.org/2001/XMLSchema-instance");
                xmlwriter.WriteAttributeString("xmlns", "xsd", null, "http://www.w3.org/2001/XMLSchema");

                xmlwriter.WriteStartElement("Items");
                xmlwriter.WriteElementString("Name", "aa");

                // Produces <Intro><![CDATA[ckata]]></Intro>.
                xmlwriter.WriteStartElement("Intro");
                xmlwriter.WriteCData("ckata");
                xmlwriter.WriteEndElement(); // Intro

                xmlwriter.WriteEndElement(); // Items
                xmlwriter.WriteEndElement(); // SpotList
                xmlwriter.WriteEndDocument();
                return true;
            }
            catch (Exception)
            {
                // Best-effort by design: callers only learn whether the write succeeded.
                return false;
            }
        }
    }
}

3. 显示所采集到的标题和 URL

 

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;


/// <summary>
/// Test page that lists every title/URL pair scraped from the local file "list.htm".
/// </summary>
public partial class ceshilist : System.Web.UI.Page
{
    /// <summary>
    /// Reads "list.htm", extracts its links, and writes each title and URL to the response.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        string html = Tools.GetAllHtml("list.htm");
        List<mylist> allList = mylist.GetAllList(html);

        foreach (mylist singleList in allList)
        {
            // The values come from scraped third-party markup, so encode them before
            // writing; any markup they contain is shown literally instead of executed.
            Response.Write("我是标题:" + HttpUtility.HtmlEncode(singleList.Mytitle));
            Response.Write("<br/>");
            Response.Write("我是网址:" + HttpUtility.HtmlEncode(singleList.Url));
            Response.Write("<br/>");
        }
    }
}


 

分类:

技术点:

相关文章: