怎样使用.NET/C# 获取百度搜索结果2015

知道91 | ASP.NET | 2015-05-19 | 阅读:6080

我们经常需要用程序读取百度的搜索结果,以便做进一步的处理。现在很多程序使用.NET来做定时任务,那么怎样使用.NET/C# 获取百度搜索结果呢?

.NET/C# 怎样获取百度搜索结果

我们首先应该分析百度的搜索结果,发现百度的搜索结果的格式为:

百度搜索结果结构

从图中标记的部分可以知道,百度的搜索结果都位于id="content_left"的容器中,每个搜索结果项都以class="result c-container"作为一项,每项的标题又包含在h3标签中,如下图所示:

百度搜索结果项

因此我们有了思路:

  1. 根据关键字获取到百度搜索结果的整个HTML文本
  2. 正则匹配到搜索结果容器的HTML
  3. 正则匹配到搜索结果每一项的HTML
  4. 取出每项结果中的题目和链接地址

直接上代码,请看下面的实现:

using System;
 using System.Collections.Generic;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Web;
 using System.Net;
using System.IO;
namespace BaiduSearchTest
{
    /// <summary>
    /// One Baidu search result: its title, summary text and target link.
    /// All three fields are plain strings and default to null.
    /// </summary>
    struct BaiduEntry
    {
        /// <summary>Title of the search result.</summary>
        public string title;

        /// <summary>Summary (brief) text of the search result.</summary>
        public string brief;

        /// <summary>Link address of the search result.</summary>
        public string link;
    }
    class Program
    {
        /// <summary>
        /// Requests the Baidu search page for <paramref name="keyword"/> and returns its HTML text.
        /// </summary>
        /// <param name="keyword">The search keyword; it is URL-encoded before being sent.</param>
        /// <returns>
        /// The response body decoded as UTF-8. If the request or the read fails, a message is
        /// printed and whatever was read up to that point (possibly an empty string) is returned.
        /// </returns>
        static string GetHtml(string keyword)
        {
            string url = @"http://www.baidu.com/";
            // Original author's note: Baidu takes the query string in code page 936 (GBK),
            // whereas the returned page itself is decoded as UTF-8 below.
            string encodedKeyword = HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(936));
            string query = "s?wd=" + encodedKeyword;
            string address = url + query;

            StringBuilder sb = new StringBuilder();
            try
            {
                Console.WriteLine("正在读取网页{0}的内容……", address);
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(address);

                // The using blocks release the connection and stream even when reading fails.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
                {
                    // Decode through a StreamReader rather than per byte chunk: a multi-byte
                    // UTF-8 character split across two chunks would otherwise be corrupted.
                    char[] buf = new char[8192];
                    int count;
                    while ((count = reader.Read(buf, 0, buf.Length)) > 0)
                    {
                        sb.Append(buf, 0, count);
                    }
                }
            }
            catch (WebException)
            {
                // Connection, DNS or HTTP-level failure.
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            catch (IOException)
            {
                // The connection dropped while the body was being read.
                Console.WriteLine("网络连接失败,请检查网络设置。");
            }
            return sb.ToString();
        }
        /// <summary>
        /// Prints every search result to the console, numbering them from 1 and
        /// writing a separator line after each one.
        /// </summary>
        /// <param name="entries">The results to print; fields that are null are skipped.</param>
        static void PrintResult(List<BaiduEntry> entries)
        {
            int count = 0;
            foreach (BaiduEntry entry in entries)
            {
                count += 1;
                Console.WriteLine("找到了百度的第{0}条搜索结果:", count);
                if (entry.link != null)
                {
                    Console.WriteLine("找到了一条链接:");
                    Console.WriteLine(entry.link);
                }
                if (entry.title != null)
                {
                    Console.WriteLine("标题为:");
                    Console.WriteLine(entry.title);
                }
                if (entry.brief != null)
                {
                    Console.WriteLine("下面是摘要:");
                    Console.WriteLine(entry.brief);
                }
                Program.Cut();
            }
        }
        /// <summary>
        /// Small self-check: runs the tag-removal helpers on a sample snippet and prints the result.
        /// </summary>
        static void simpleOutput()
        {
            // NOTE(review): the markup of this sample was lost when the listing was published;
            // only the words "test" and "hello" survived. The tags here are a reconstruction.
            string html = "<div><em>test</em><b>hello</b><br></div>";
            Console.WriteLine(RemoveSomeTags(html));
        }

        /// <summary>
        /// Removes void (self-contained) tags from <paramref name="html"/> by plain string replacement.
        /// </summary>
        static string RemoveVoidTag(string html)
        {
            // NOTE(review): the original entry was stripped from the listing and left only a
            // line break behind, so "<br>" is assumed - confirm.
            string[] filter = { "<br>" };
            foreach (string tag in filter)
            {
                html = html.Replace(tag, "");
            }
            return html;
        }

        /// <summary>
        /// Removes inline formatting tags from <paramref name="html"/> while keeping their inner text.
        /// Each filter entry is used as a regular-expression pattern.
        /// </summary>
        static string ReleaseXmlTags(string html)
        {
            // NOTE(review): the eight original patterns were stripped from the listing (they
            // survived only as empty strings, which make Regex.Replace a no-op). This list of
            // inline tags is an assumption - adjust it to the tags that need removing.
            string[] filter = { "<em>", "</em>", "<b>", "</b>", "<strong>", "</strong>", "<font[^>]*>", "</font>" };
            foreach (string tag in filter)
            {
                html = Regex.Replace(html, tag, "");
            }
            return html;
        }

        /// <summary>
        /// Applies <see cref="RemoveVoidTag"/> and then <see cref="ReleaseXmlTags"/> to <paramref name="html"/>.
        /// </summary>
        static string RemoveSomeTags(string html)
        {
            html = RemoveVoidTag(html);
            html = ReleaseXmlTags(html);
            return html;
        }

        /// <summary>Prints a separator line.</summary>
        static void Cut()
        {
            Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        }

        /// <summary>
        /// Processes the result-container HTML; equivalent to calling the two-argument
        /// overload with <c>tagsForBrief</c> set to false.
        /// </summary>
        static void MainProc(string input)
        {
            MainProc(input, false);
        }

        /// <summary>
        /// Finds every search result item in <paramref name="input"/> and writes the content of
        /// its h3 title element followed by every http link found in the item.
        /// </summary>
        /// <param name="input">HTML of the search result container.</param>
        /// <param name="tagsForBrief">Not used by this implementation.</param>
        static void MainProc(string input, bool tagsForBrief)
        {
            // NOTE(review): the original pattern was stripped from the listing. It is rebuilt from
            // the article's description: each item is a div with class "result c-container" and
            // its title sits in an h3. The match runs from the item's opening tag to the end of
            // its h3, because a regex cannot find the matching close of a nested div.
            Regex r = new Regex(@"<div[^>]*class=""result c-container[^""]*""[^>]*>[\s\S]*?</h3>", RegexOptions.IgnoreCase);
            MatchCollection matchCollection = r.Matches(input);
            foreach (Match m in matchCollection)
            {
                // Title: everything between <h3 ...> and </h3> (the h3 tags were stripped
                // from the published listing and are restored here).
                string textReg = @"<h3[^>]*>([\s\S]+?)</h3>";
                MatchCollection textMatchCollection = Regex.Matches(m.Value, textReg, RegexOptions.IgnoreCase);
                foreach (Match match in textMatchCollection)
                {
                    if (match.Success)
                    {
                        Console.Write(match.Result("$1"));
                    }
                }

                // Links: any http URL appearing in the item's HTML.
                string LinkReg = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
                MatchCollection linkMatchCollection = Regex.Matches(m.Value, LinkReg, RegexOptions.IgnoreCase);
                foreach (Match match in linkMatchCollection)
                {
                    if (match.Success)
                    {
                        Console.Write(match.Groups[0].Value);
                    }
                }
            }
        }

        /// <summary>
        /// Entry point: reads a keyword, downloads the Baidu result page, cuts out the
        /// result container and prints the title and links of each result.
        /// </summary>
        public static void Main(string[] args)
        {
            Console.WriteLine("请输入一个关键字。");
            string keyword;
            keyword = Console.ReadLine();
            Console.WriteLine("正在从百度上获取结果,请稍等……");
            string input;
            input = GetHtml(keyword);
            // NOTE(review): the original pattern was stripped from the listing. It is rebuilt from
            // the article's description that all results live inside id="content_left". The
            // greedy match runs to the last </div> of the page so that every item is included.
            Regex r = new Regex(@"<div[^>]*id=""content_left""[^>]*>[\s\S]*</div>", RegexOptions.IgnoreCase);
            input = r.Match(input).Value;
            MainProc(input);
            Console.ReadKey(true);
        }
    }
}

程序结果如下图所示:

.NET/C# 获取百度搜索结果

通过上面的例子你应该明白怎样使用.NET/C# 获取百度搜索结果项了吧,程序可以直接使用,如果没有得到结果说明是百度搜索的结构变了,请按程序思路改正。