我们经常需要用程序读取百度的搜索结果,以便做进一步处理。现在很多定时任务是用 .NET 编写的,那么怎样使用 .NET/C# 获取百度搜索结果呢?

首先分析百度搜索结果页面的 HTML 结构,其格式如下图所示:

从图中标记的部分可以知道,百度的搜索结果都位于 id="content_left" 的元素中,每条搜索结果是一个 class="result c-container" 的元素,每条结果的标题又包含在 h3 标签中,如下图所示:

因此我们有了思路:先下载搜索结果页面,取出 id="content_left" 部分的内容,再用正则表达式逐条匹配搜索结果项,最后从每一项中提取 h3 标签里的标题和链接。
直接上代码:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;
namespace BaiduSearchTest
{
/// <summary>
/// One search result entry, stored as three public string fields.
/// A default-constructed entry has all three fields set to null.
/// </summary>
struct BaiduEntry
{
    /// <summary>Title of the entry.</summary>
    public string title;

    /// <summary>Brief (summary text) of the entry.</summary>
    public string brief;

    /// <summary>Link of the entry.</summary>
    public string link;
}
class Program
{
/// <summary>
/// Requests the Baidu search page for <paramref name="keyword"/> and returns the response body as text.
/// </summary>
/// <param name="keyword">Search keyword; it is URL-encoded with code page 936 before being sent.</param>
/// <returns>
/// The response body decoded as UTF-8. If the request or the read fails, a message is
/// printed to the console and whatever was read so far (possibly an empty string) is returned.
/// </returns>
static string GetHtml(string keyword)
{
    string url = @"http://www.baidu.com/";
    // Original author's remarks: Baidu takes the query string in code page 936, while
    // Google recognizes both UTF-8 and code-page encoded queries (its own pages declare
    // UTF-8 in the HTTP header).
    string encodedKeyword = HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(936));
    string query = "s?wd=" + encodedKeyword;
    StringBuilder sb = new StringBuilder();
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url + query);
        // The using blocks release the connection and the stream even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
        {
            Console.WriteLine("正在读取网页{0}的内容……", url + query);
            // Decode through a StreamReader rather than converting each byte chunk on its
            // own: a multi-byte UTF-8 character may straddle two chunks, and decoding the
            // halves separately would corrupt it.
            char[] buf = new char[8192];
            int count;
            while ((count = reader.Read(buf, 0, buf.Length)) > 0)
            {
                sb.Append(buf, 0, count);
            }
        }
    }
    catch (WebException)
    {
        Console.WriteLine("网络连接失败,请检查网络设置。");
    }
    catch (IOException)
    {
        Console.WriteLine("网络连接失败,请检查网络设置。");
    }
    return sb.ToString();
}
/// <summary>
/// Prints every entry of <paramref name="entries"/> to the console, numbering them from 1.
/// For each entry, only the fields that are not null are printed (link, then title, then
/// brief), and a separator line is written after each entry.
/// </summary>
/// <param name="entries">Entries to print, in order.</param>
static void PrintResult(List<BaiduEntry> entries)
{
    int count = 0;
    foreach (BaiduEntry entry in entries)
    {
        count += 1;
        Console.WriteLine("找到了百度的第{0}条搜索结果:", count);
        if (entry.link != null)
        {
            Console.WriteLine("找到了一条链接:");
            Console.WriteLine(entry.link);
        }
        if (entry.title != null)
        {
            Console.WriteLine("标题为:");
            Console.WriteLine(entry.title);
        }
        if (entry.brief != null)
        {
            Console.WriteLine("下面是摘要:");
            Console.WriteLine(entry.brief);
        }
        Program.Cut();
    }
}
/// <summary>
/// Manual check helper: passes a hard-coded sample string through RemoveSomeTags and
/// prints the result to the console.
/// </summary>
/// <remarks>
/// NOTE(review): the sample literal below is split across two lines, which a regular C#
/// string literal does not allow. Presumably the sample contained markup that was lost
/// when the listing was published - restore the intended sample before compiling.
/// </remarks>
static void simpleOutput()
{
string html = "testhello
";
Console.WriteLine(RemoveSomeTags(html));
}
/// <summary>
/// Removes every occurrence of each string in the filter list from
/// <paramref name="html"/> (plain string replacement, not a regular expression) and
/// returns the result.
/// </summary>
/// <remarks>
/// NOTE(review): the only filter entry is a string literal split across two lines, which
/// a regular C# string literal does not allow. Presumably it was a tag that was lost when
/// the listing was published - confirm and restore the intended entries.
/// </remarks>
static string RemoveVoidTag(string html)
{
string[] filter = { "
" };
foreach (string tag in filter)
{
// Each pass strips one filter string from the text produced by the previous pass.
html = html.Replace(tag, "");
}
return html;
}
/// <summary>
/// Applies a fixed list of regular-expression patterns to <paramref name="html"/>, one
/// after another, replacing every match with an empty string, and returns the result.
/// </summary>
/// <remarks>
/// All eight patterns are empty strings as written. An empty pattern only produces
/// zero-length matches, so the text comes back unchanged.
/// NOTE(review): presumably these were tag patterns lost when the listing was
/// published - confirm and restore them.
/// </remarks>
static string ReleaseXmlTags(string html)
{
    string[] patterns = { "", "", "", "", "", "", "", "" };
    string result = html;
    for (int i = 0; i < patterns.Length; i++)
    {
        result = Regex.Replace(result, patterns[i], "");
    }
    return result;
}
/// <summary>
/// Cleans <paramref name="html"/> in two steps: first RemoveVoidTag, then ReleaseXmlTags
/// on its output.
/// </summary>
static string RemoveSomeTags(string html)
{
    string withoutVoidTags = RemoveVoidTag(html);
    return ReleaseXmlTags(withoutVoidTags);
}
/// <summary>
/// Writes a separator line made of tilde characters to the console.
/// </summary>
static void Cut()
{
    const string separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~";
    Console.WriteLine(separator);
}
/// <summary>
/// Convenience overload: runs the two-argument MainProc on <paramref name="input"/> with
/// its second argument set to false.
/// </summary>
static void MainProc(string input)
{
    bool tagsForBrief = false;
    MainProc(input, tagsForBrief);
}
/// <summary>
/// Splits <paramref name="input"/> into result items with an outer regular expression
/// and, for each item, writes to the console (without separators) the first capture group
/// of every text match, followed by every http link found in the item.
/// </summary>
/// <param name="input">HTML fragment to scan.</param>
/// <param name="tagsForBrief">Not read by this method.</param>
/// <remarks>
/// NOTE(review): the outer pattern is an empty string and the text pattern starts with a
/// dangling "]*&gt;". Presumably the tag parts of both patterns were lost when the listing
/// was published. They are kept exactly as found here - confirm and restore them.
/// </remarks>
static void MainProc(string input, bool tagsForBrief)
{
    Regex r = new Regex(@"", RegexOptions.IgnoreCase);
    // The patterns do not change between items, so they are defined once up front.
    string textReg = @"]*>([\s\S]+?)";
    string LinkReg = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    foreach (Match m in r.Matches(input))
    {
        // A MatchCollection only holds successful matches, so no Success check is needed.
        foreach (Match match in Regex.Matches(m.Value, textReg, RegexOptions.IgnoreCase))
        {
            Console.Write(match.Result("$1"));
        }
        foreach (Match match in Regex.Matches(m.Value, LinkReg, RegexOptions.IgnoreCase))
        {
            Console.Write(match.Groups[0].Value);
        }
    }
}
/// <summary>
/// Program entry point: reads a keyword from the console, downloads the result page with
/// GetHtml, narrows it with a regular expression, passes the matched text to MainProc,
/// and then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <remarks>
/// NOTE(review): the narrowing pattern is an empty string as written, so the value handed
/// to MainProc is always an empty string. Presumably the pattern was meant to isolate the
/// results container and was lost when the listing was published - confirm and restore it.
/// </remarks>
public static void Main(string[] args)
{
    Console.WriteLine("请输入一个关键字。");
    string keyword = Console.ReadLine();

    Console.WriteLine("正在从百度上获取结果,请稍等……");
    string html = GetHtml(keyword);

    Regex r = new Regex("", RegexOptions.IgnoreCase);
    string resultsPart = r.Match(html).Value;
    MainProc(resultsPart);

    Console.ReadKey(true);
}
}
}
程序结果如下图所示:

通过上面的例子,你应该明白怎样使用 .NET/C# 获取百度搜索结果项了。程序可以直接使用;如果没有得到结果,说明百度搜索结果页面的结构变了,请按照上述思路修改相应的正则表达式。