控制台调用
// Console entry point: resolves the charset name reported for the page to an Encoding
// and fetches the page text using that Encoding.
// NOTE(review): GetEncodings and the two-argument GetHtml are not defined in this excerpt;
// presumably they are declared elsewhere in the same class — confirm.
static void Main(string[] args)
{
    string charsetName = GetEncodings("http://www.cnblogs.com");
    Encoding pageEncoding = Encoding.GetEncoding(charsetName);
    string pageHtml = GetHtml("http://www.cnblogs.com", pageEncoding);
}

下面的代码并非核心部分,只是用来获取网页标题或其它内容。

// Gets the HTML content of a web page, choosing the Encoding automatically from the page's charset.
        static string GetHtml(string url)
        {
            // A null Encoding tells GetHtmls to work out the charset from the downloaded text.
            Encoding detectFromPage = null;
            return GetHtmls(url, detectFromPage);
        }

        // Gets the HTML content of a web page.
        //   url      - address passed to WebClient.DownloadData.
        //   encoding - Encoding used to decode the downloaded bytes; when null, the bytes are
        //              decoded as UTF-8 first, the charset declared in that text is looked up
        //              with GetEncoding, and the bytes are re-decoded if it is not UTF-8.
        // Returns the decoded page text. Exceptions raised by the download are not caught here.
        static string GetHtmls(string url, Encoding encoding)
        {
            byte[] buf;

            // WebClient is disposable; release it as soon as the download has finished.
            using (WebClient client = new WebClient())
            {
                buf = client.DownloadData(url);
            }

            if (encoding != null)
            {
                return encoding.GetString(buf);
            }

            // Provisional UTF-8 decode, only used to read the charset declaration.
            string html = Encoding.UTF8.GetString(buf);
            encoding = GetEncoding(html);

            // Compare code pages rather than references: a UTF-8 Encoding instance that is not
            // the Encoding.UTF8 singleton should still take the "already decoded" shortcut.
            if (encoding == null || encoding.CodePage == Encoding.UTF8.CodePage)
            {
                return html;
            }

            return encoding.GetString(buf);
        }

        // Extracts the Encoding declared in a page's HTML content.
        //   html - page text to search for a "charset=..." declaration.
        // Returns the Encoding for the first declared charset name, or null when no declaration
        // is found or Encoding.GetEncoding does not recognise the name.
        static Encoding GetEncoding(string html)
        {
            // A named group is written (?<name>...) with no space after '?'; a space there makes
            // the pattern invalid and Regex.Match throws. Optional whitespace around '=' and an
            // optional opening quote are accepted so that <meta charset="utf-8"> also matches.
            string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
            Match match = Regex.Match(html, pattern);
            if (!match.Success)
            {
                return null;
            }

            string charset = match.Groups["charset"].Value;
            try { return Encoding.GetEncoding(charset); }
            catch (ArgumentException) { return null; } // unknown or unsupported charset name
        }

        // Extracts the Title from a page's HTML content.
        //   html - page text to search for a <title> element.
        // Returns the text between the first <title ...> and the next </title>, trimmed of
        // surrounding whitespace, or an empty string when no title element is found.
        static string GetTitle(string html)
        {
            // (?s) lets '.' span line breaks, (?i) ignores tag case. The optional group after
            // "<title" skips attributes, including quoted values that contain '>'.
            // The named group must be written (?<title>...) with no space after '?', and no
            // literal spaces are required before the opening or closing tag.
            string pattern = @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>";
            return Regex.Match(html, pattern).Groups["title"].Value.Trim();
        }

        // Prints the Encoding and Title of the web page at url to the console.
        static void PrintEncodingAndTitle(string url)
        {
            string pageText = GetHtml(url);

            // Both values are derived from the same downloaded text.
            Encoding declaredEncoding = GetEncoding(pageText);
            string pageTitle = GetTitle(pageText);

            Console.WriteLine("[{0}] [{1}]", declaredEncoding, pageTitle);
        }
里面的代码不重要,只是获取其它的内容

相关文章:

  • 2022-12-23
  • 2021-09-20
  • 2022-12-23
  • 2021-05-24
  • 2021-08-04
  • 2022-12-23
  • 2022-02-24
  • 2022-02-07
猜你喜欢
  • 2021-08-08
  • 2022-12-23
  • 2021-07-13
  • 2021-06-26
  • 2021-09-24
相关资源
相似解决方案