爬虫之博客园精华客户端

　　在博客园学习知识是很方便的，但若做成客户端，自定义获取数据，那就更好啦！

　　那么需求有哪些呢，第一，我只查看推荐数大于2的文章；第二，我想要只查看C#或者Java的文章；第三，我想要查看推荐数大于2的新闻；第四，我还想搜索文章，并且只搜索推荐数大于2的文章。

　　先来预览一下成品吧

爬虫之博客园精华客户端

　　其中列表里左边是推荐数，反正我是优先看推荐数多的，中间是标题，右边是日期，至于其他信息，额，我其实不太关心，点击一行后直接在浏览器打开。

　　额，大体先这样吧，那么实现这些功能需要什么技能呢，首先我得准备一下通用类，大概需要web请求的帮助类、Gzip格式网页的压缩/解压帮助类、html字符串解析的帮助类。

web请求的帮助类：WebHelper

    /// <summary>
    /// Wrapper around <see cref="WebClient"/> whose request methods retry after a failure
    /// and return <c>null</c> (instead of throwing) once the attempts are used up.
    /// </summary>
    public class WebHelper
    {
        /// <summary>The underlying client used for every request.</summary>
        public readonly WebClient Web = new WebClient();

        // Retries allowed after the first failed attempt, i.e. at most three attempts per call.
        // Kept as a per-call loop bound rather than a shared counter field so that one call's
        // failures never reduce the attempts available to a later call.
        private const int MaxRetries = 2;

        /// <summary>
        /// Encoding the underlying client uses to convert between strings and bytes.
        /// </summary>
        public Encoding Encoding
        {
            get { return Web.Encoding; }
            set { Web.Encoding = value; }
        }

        /// <summary>
        /// Creates a helper whose client uses UTF-8.
        /// </summary>
        public WebHelper()
        {
            Web.Encoding = Encoding.UTF8;
        }

        /// <summary>
        /// Creates a helper whose client uses the given encoding.
        /// </summary>
        /// <param name="encoding">Encoding assigned to the underlying client.</param>
        public WebHelper(Encoding encoding)
        {
            Web.Encoding = encoding;
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> as a string.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <returns>
        /// The response text, or <c>null</c> when the request fails with an error that is not
        /// retried or when every attempt fails.
        /// </returns>
        public string DownloadString(string url)
        {
            for (int attempt = 0; ; attempt++)
            {
                try
                {
                    return Web.DownloadString(url);
                }
                catch (WebException e)
                {
                    if (IsNotRetried(e) || attempt >= MaxRetries)
                    {
                        return null;
                    }
                }
            }
        }

        /// <summary>
        /// POSTs <paramref name="data"/> to <paramref name="address"/> as
        /// <c>application/x-www-form-urlencoded</c> content.
        /// </summary>
        /// <param name="address">Target address.</param>
        /// <param name="data">Request body.</param>
        /// <returns>The response text, or <c>null</c> when every attempt fails.</returns>
        public string UploadString(string address, string data)
        {
            for (int attempt = 0; ; attempt++)
            {
                // Assigned (not Add-ed) before every attempt: assignment is idempotent, whereas
                // Add would append a second value if the header is still present.
                Web.Headers["Content-Type"] = "application/x-www-form-urlencoded";

                try
                {
                    return Web.UploadString(address, "POST", data);
                }
                catch
                {
                    // Best effort by design: any failure counts as a failed attempt.
                    if (attempt >= MaxRetries)
                    {
                        return null;
                    }
                }
            }
        }


        /// <summary>
        /// Downloads the resource at <paramref name="url"/> while advertising gzip support,
        /// and returns the (decompressed) body decoded with <paramref name="encoding"/>.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="encoding">Encoding of the page text.</param>
        /// <returns>
        /// The page text, or <c>null</c> when the request fails with an error that is not
        /// retried or when every attempt fails.
        /// </returns>
        public string DownloadGzipString(string url, Encoding encoding)
        {
            // Assignment instead of Add so repeated attempts cannot produce "gzip,gzip".
            Web.Headers["Accept-Encoding"] = "gzip";
            try
            {
                for (int attempt = 0; ; attempt++)
                {
                    try
                    {
                        byte[] body = Web.DownloadData(url);

                        // A server may ignore Accept-Encoding and answer uncompressed;
                        // only inflate when the body really starts with the gzip signature.
                        if (HasGzipSignature(body))
                        {
                            body = ZipHelper.GzipDecompress(body);
                        }

                        return encoding.GetString(body);
                    }
                    catch (WebException e)
                    {
                        if (IsNotRetried(e) || attempt >= MaxRetries)
                        {
                            return null;
                        }
                    }
                }
            }
            finally
            {
                // Do not leak the header into later, non-gzip requests on the same client.
                Web.Headers.Remove("Accept-Encoding");
            }
        }

        // True for the failures the download methods give up on immediately:
        // a message mentioning 404, a connect failure, or a protocol error.
        private static bool IsNotRetried(WebException e)
        {
            return e.Message.Contains("404")
                   || e.Status == WebExceptionStatus.ConnectFailure
                   || e.Status == WebExceptionStatus.ProtocolError;
        }

        // True when the data begins with the two gzip signature bytes 0x1F 0x8B.
        private static bool HasGzipSignature(byte[] data)
        {
            return data != null && data.Length >= 2 && data[0] == 0x1f && data[1] == 0x8b;
        }
    }

这里有三个方法，其中DownloadString和UploadString的用法与.NET Framework中WebClient的同名方法一样，另外多了一个DownloadGzipString方法，用于GET一个用Gzip压缩的页面。之所以重新封装DownloadString和UploadString，是因为我懒：有时候请求网页出现异常并不代表该网页不能请求，多请求几次就能获取，所以这里会自动尝试3次请求，3次请求过后依然失败则返回null。当然还有一种情况是需要用代理的。考虑到需要用代理的地方不多，并且代理的IP和端口一般需要花钱来买，这里就不贴用代理来请求页面的代码了。之前我买过两天代理耍过，那时候的实现思路是加一个ProxyPool代理池类：代理池从代理网站获取当前可用的代理（一般是一次获取十几个），然后放入代理池；请求需要代理的网站时就去代理池获取一个代理，设置WebClient.Proxy = new WebProxy(host, port);之后再去请求页面就可以了。当然代理不一定可靠，所以失败后不要灰心，再用其他代理试试，总有一个成功的。当需要多线程请求网页时，就new多个WebHelper实例，它们都会共用同一个ProxyPool代理池。

Gzip格式网页的压缩/解压帮助类：ZipHelper

    /// <summary>
    /// Helpers for compressing and decompressing byte arrays with GZip.
    /// </summary>
    public class ZipHelper
    {
        /// <summary>
        /// Compresses a byte array with GZip.
        /// </summary>
        /// <param name="cbytes">Data to compress.</param>
        /// <returns>The compressed bytes.</returns>
        public static byte[] GzipCompress(byte[] cbytes)
        {
            using (var packed = new MemoryStream())
            {
                // The compressor is closed before the buffer is read back,
                // so everything it produced has reached the buffer by then.
                using (var compressor = new GZipStream(packed, CompressionMode.Compress))
                {
                    compressor.Write(cbytes, 0, cbytes.Length);
                }

                return packed.ToArray();
            }
        }

        /// <summary>
        /// Decompresses GZip data.
        /// </summary>
        /// <param name="cbytes">GZip-compressed data.</param>
        /// <returns>The decompressed bytes.</returns>
        public static byte[] GzipDecompress(byte[] cbytes)
        {
            using (var unpacked = new MemoryStream())
            using (var source = new MemoryStream(cbytes))
            using (var inflater = new GZipStream(source, CompressionMode.Decompress))
            {
                // Reading from the GZip stream yields decompressed bytes; drain it completely.
                inflater.CopyTo(unpacked);
                return unpacked.ToArray();
            }
        }
    }

html字符串解析的帮助类：StringHelper

    /// <summary>
    /// String-scanning helpers for extracting values from HTML or similar text.
    /// </summary>
    public class StringHelper
    {
        // A \uXXXX escape with exactly four hexadecimal digits (the 'u' may be upper- or lower-case).
        private static readonly Regex UnicodeEscapeRegex =
            new Regex(@"\\u([0-9a-f]{4})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        // A markup tag. The second alternative is already covered by the first; the pattern is
        // kept as it was so the matching behavior is untouched.
        private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>|</[^>]+>");

        /// <summary>
        /// Collects every value in <paramref name="str"/> that lies between
        /// <paramref name="startStr"/> and the next <paramref name="endStr"/>, scanning left to right.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <param name="startStr">Opening delimiter.</param>
        /// <param name="endStr">Closing delimiter.</param>
        /// <param name="remove">
        /// <c>true</c> to return only the text between the delimiters;
        /// <c>false</c> to include both delimiters in each value.
        /// </param>
        /// <returns>The values in order of appearance; empty when nothing matches.</returns>
        /// <exception cref="ArgumentException">
        /// The delimiters are such that a match does not move the scan forward
        /// (both are empty), which would otherwise loop forever.
        /// </exception>
        public static List<string> GetList(string str, string startStr, string endStr, bool remove = true)
        {
            var lst = new List<string>();
            int cursor = 0;
            while (true)
            {
                int before = cursor;
                string v = GetVal(str, startStr, endStr, remove, ref cursor);
                if (cursor == -1)
                {
                    break;
                }
                if (cursor <= before)
                {
                    throw new ArgumentException("startStr and endStr must not both be empty.");
                }
                lst.Add(v);
            }
            return lst;
        }

        /// <summary>
        /// Returns the first value at or after <paramref name="startIndex"/> that lies between
        /// <paramref name="startStr"/> and the next <paramref name="endStr"/>.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <param name="startStr">Opening delimiter.</param>
        /// <param name="endStr">Closing delimiter.</param>
        /// <param name="remove">
        /// <c>true</c> to return only the text between the delimiters;
        /// <c>false</c> to include both delimiters.
        /// </param>
        /// <param name="startIndex">Position at which the search begins.</param>
        /// <returns>The value, or an empty string when either delimiter is not found.</returns>
        public static string GetVal(string str, string startStr, string endStr, bool remove = true, int startIndex = 0)
        {
            return GetVal(str, startStr, endStr, remove, ref startIndex);
        }

        // Core of the delimiter-based search. On return startIndex is the position just past the
        // closing delimiter, or -1 when no value was found.
        private static string GetVal(string str, string startStr, string endStr, bool remove, ref int startIndex)
        {
            // Ordinal for both delimiters: the offsets below assume a match spans exactly
            // startStr.Length characters, which a culture-sensitive search does not guarantee.
            int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
            if (istart == -1)
            {
                startIndex = -1;
                return string.Empty;
            }

            int iend = str.IndexOf(endStr, istart + startStr.Length, StringComparison.Ordinal);

            if (iend == -1)
            {
                startIndex = -1;
                return string.Empty;
            }

            startIndex = iend + endStr.Length;

            if (remove)
            {
                istart += startStr.Length;
                return str.Substring(istart, iend - istart);
            }

            return str.Substring(istart, startIndex - istart);
        }

        /// <summary>
        /// Collects every fixed-length value in <paramref name="str"/> that follows an
        /// occurrence of <paramref name="startStr"/>, scanning left to right.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <param name="startStr">Marker that precedes each value.</param>
        /// <param name="needLength">Number of characters to take after the marker (marker length not included).</param>
        /// <param name="remove">
        /// <c>true</c> to return only the characters after the marker;
        /// <c>false</c> to include the marker in each value.
        /// </param>
        /// <returns>The values in order of appearance; empty when nothing matches.</returns>
        /// <exception cref="ArgumentException">
        /// A match does not move the scan forward (for example an empty marker with a zero
        /// length), which would otherwise loop forever.
        /// </exception>
        public static List<string> GetList(string str, string startStr, int needLength, bool remove = true)
        {
            var lst = new List<string>();
            int cursor = 0;
            while (true)
            {
                int before = cursor;
                string v = GetVal(str, startStr, needLength, remove, ref cursor);
                if (cursor == -1)
                {
                    break;
                }
                if (cursor <= before)
                {
                    throw new ArgumentException("startStr and needLength must describe a non-empty match.");
                }
                lst.Add(v);
            }
            return lst;
        }

        /// <summary>
        /// Returns the <paramref name="needLength"/> characters that follow the first occurrence
        /// of <paramref name="startStr"/> at or after <paramref name="startIndex"/>.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <param name="startStr">Marker that precedes the value.</param>
        /// <param name="needLength">Number of characters to take after the marker.</param>
        /// <param name="remove">
        /// <c>true</c> to return only the characters after the marker;
        /// <c>false</c> to include the marker.
        /// </param>
        /// <param name="startIndex">Position at which the search begins.</param>
        /// <returns>
        /// The value, or an empty string when the marker is not found or fewer than
        /// <paramref name="needLength"/> characters follow it.
        /// </returns>
        public static string GetVal(string str, string startStr, int needLength, bool remove = true, int startIndex = 0)
        {
            return GetVal(str, startStr, needLength, remove, ref startIndex);
        }

        /// <summary>
        /// Same as the by-value overload, but reports where the scan stopped.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <param name="startStr">Marker that precedes the value.</param>
        /// <param name="needLength">Number of characters to take after the marker.</param>
        /// <param name="remove">
        /// <c>true</c> to return only the characters after the marker;
        /// <c>false</c> to include the marker.
        /// </param>
        /// <param name="startIndex">
        /// On entry, the position at which the search begins. On return, the position just past
        /// the extracted value, or -1 when no value was found.
        /// </param>
        /// <returns>The value, or an empty string when no value was found.</returns>
        public static string GetVal(string str, string startStr, int needLength, bool remove, ref int startIndex)
        {
            int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);

            if (istart == -1)
            {
                startIndex = -1;
                return string.Empty;
            }

            startIndex = istart + startStr.Length + needLength;

            // Not enough characters left after the marker.
            if (startIndex > str.Length)
            {
                startIndex = -1;
                return string.Empty;
            }

            return remove
                ? str.Substring(istart + startStr.Length, needLength)
                : str.Substring(istart, startStr.Length + needLength);
        }

        /// <summary>
        /// Returns the value of every <c>href="..."</c> attribute found in the text.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <returns>The link targets in order of appearance.</returns>
        public static List<string> GetUrls(string str)
        {
            return GetList(str, "href=\"", "\"");
        }

        /// <summary>
        /// Returns the value of the first <c>href="..."</c> attribute found in the text.
        /// </summary>
        /// <param name="str">Text to scan.</param>
        /// <returns>The link target, or an empty string when there is none.</returns>
        public static string GetUrl(string str)
        {
            return GetVal(str, "href=\"", "\"");
        }

        /// <summary>
        /// Decodes the <c>\uXXXX</c> escape sequences contained in <paramref name="str"/>.
        /// </summary>
        /// <remarks>
        /// Despite the name, no GB2312 conversion is performed; the result is an ordinary string.
        /// Only the decoded escapes are returned, concatenated in order — any text outside the
        /// escapes is not part of the result.
        /// NOTE(review): dropping the surrounding text may be unintended; confirm with callers
        /// before changing it.
        /// </remarks>
        /// <param name="str">Text containing <c>\uXXXX</c> escapes.</param>
        /// <returns>The decoded characters; empty when the text contains no escape.</returns>
        public static string ToGB2312(string str)
        {
            var decoded = new StringBuilder();
            foreach (Match m in UnicodeEscapeRegex.Matches(str))
            {
                // Each escape is one UTF-16 code unit; appending it directly keeps surrogate
                // pairs (two consecutive escapes) intact.
                decoded.Append((char) int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture));
            }
            return decoded.ToString();
        }

        /// <summary>
        /// Removes every markup tag from the given HTML, leaving the text between tags.
        /// </summary>
        /// <param name="html">HTML text.</param>
        /// <returns>The text with all tags removed.</returns>
        public static string RemoveHTMLTags(string html)
        {
            return HtmlTagRegex.Replace(html, "");
        }
    }

View Code