本文主要介绍在.NET Core中使用AngleSharp解析百度和谷歌搜索结果页的html代码,提取结果列表中每条结果的标题、链接和描述的方法,以及相关的示例代码。

1、项目中安装引用AngleSharp

AngleSharp:http://anglesharp.github.io/

1)使用Nuget管理控制台

AngleSharp集成到项目中的最简单方法是使用NuGet。您可以通过打开包管理器控制台(PM)并键入以下语句来安装AngleSharp:

Install-Package AngleSharp

2)使用Nuget图形管理器

使用Nuget的图形界面管理器搜索"AngleSharp"=> 找到后点击"安装"

3)使用.NET CLI命令安装

> dotnet add TodoApi.csproj package AngleSharp

相关文档:VS(Visual Studio)中Nuget的使用

2、提取解析百度的搜索结果的html

/// <summary>
/// Downloads one page of Baidu search results for <paramref name="wd"/> and extracts,
/// for each result entry, its title, description, link, displayed URL text and snapshot link.
/// </summary>
/// <param name="wd">Search keyword; it is URL-escaped before being placed in the query string.</param>
/// <param name="p">1-based page number; values of 0 or less are treated as page 1.</param>
/// <returns>
/// The parsed entries in document order. Entries that lack any of the expected
/// elements are skipped, so the list may be empty.
/// </returns>
public static List<SearchResult> GetBaiduResult(string wd, int p)
{
    if (p <= 0)
    {
        p = 1;
    }

    // The "pn" parameter receives (page - 1) * 10.
    string url = string.Format("https://www.baidu.com/s?wd={0}&pn={1}", Uri.EscapeDataString(wd), (p - 1) * 10);

    // Send browser-like headers with the request.
    var requester = new DefaultHttpRequester("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
    requester.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    requester.Headers.Add("Referer", "");
    requester.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3");
    var context = BrowsingContext.New(Configuration.Default.WithLocaleBasedEncoding().WithDefaultLoader().WithDefaultCookies().With(requester));

    // Load and parse the page through the browsing context.
    // NOTE(review): this blocks on the asynchronous load to keep the method synchronous.
    var document = context.OpenAsync(url).Result;

    // Select every result container with a CSS selector.
    var resultItems = document.QuerySelectorAll("#content_left .result");
    List<SearchResult> results = new List<SearchResult>();
    foreach (var item in resultItems)
    {
        // Query each element once and reuse it for both the guard and the extraction.
        var titleLink = item.QuerySelector("h3 a");
        var summary = item.QuerySelector("div.c-abstract");
        var displayUrl = item.QuerySelector("div.f13 a.c-showurl");
        var snapshotLink = item.QuerySelector("div.f13 a.m");

        // Skip containers that do not have the full set of expected elements.
        if (titleLink == null || summary == null || displayUrl == null || snapshotLink == null)
        {
            continue;
        }

        results.Add(new SearchResult()
        {
            title = titleLink.Html(),
            desc = summary.Html(),
            url = titleLink.GetAttribute("href"),
            urlText = displayUrl.Html(),
            snapshot = snapshotLink.GetAttribute("href")
        });
    }
    return results;
}

3、提取解析谷歌的搜索结果的html

/// <summary>
/// Downloads one page of Google search results for <paramref name="wd"/> and extracts,
/// for each result entry, its title, description, link, displayed URL text and snapshot link.
/// </summary>
/// <param name="wd">Search keyword; it is URL-escaped before being placed in the query string.</param>
/// <param name="p">1-based page number; values of 0 or less are treated as page 1.</param>
/// <returns>
/// The parsed entries in document order. Entries that lack any of the expected
/// elements, or whose extraction throws, are skipped, so the list may be empty.
/// </returns>
public static List<SearchResult> GetGoogleResult(string wd, int p)
{
    if (p <= 0)
    {
        p = 1;
    }

    // The "start" parameter receives (page - 1) * 10.
    string url = string.Format("https://www.google.com/search?q={0}&start={1}", Uri.EscapeDataString(wd), (p - 1) * 10);

    // Send browser-like headers with the request.
    var requester = new DefaultHttpRequester("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
    requester.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    requester.Headers.Add("Referer", "");
    requester.Headers.Add("Accept-Language", "zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3");
    var context = BrowsingContext.New(Configuration.Default.WithLocaleBasedEncoding().WithDefaultLoader().WithDefaultCookies().With(requester));

    // Load and parse the page through the browsing context.
    // NOTE(review): this blocks on the asynchronous load to keep the method synchronous.
    var document = context.OpenAsync(url).Result;

    // Select every direct child container of the result list with a CSS selector.
    var resultItems = document.QuerySelectorAll("#rso > div");
    List<SearchResult> results = new List<SearchResult>();
    foreach (var item in resultItems)
    {
        try
        {
            // Query each element once and reuse it for both the guard and the extraction.
            var titleHeading = item.QuerySelector("div > div.r > a > h3");
            var summary = item.QuerySelector("div > div.s > div");
            var titleLink = item.QuerySelector("div > div.r > a");
            var displayUrl = item.QuerySelector("div > div.r > a > div > cite");
            var snapshotLink = item.QuerySelector("div > div.r > div > div.eFM0qc > span > div > ol > li > a");

            // Skip containers that do not have the full set of expected elements.
            if (titleHeading == null || summary == null || titleLink == null || displayUrl == null || snapshotLink == null)
            {
                continue;
            }

            results.Add(new SearchResult()
            {
                title = titleHeading.Html(),
                desc = summary.Html(),
                url = titleLink.GetAttribute("href"),
                urlText = displayUrl.Html(),
                snapshot = snapshotLink.GetAttribute("href")
            });
        }
        catch (Exception ex)
        {
            // A failure on one entry is logged and must not abort the remaining entries.
            log.Error(ex);
        }
    }
    return results;
}

相关文档:.Net(C#) Core安装使用anglesharp解析html的方法及示例代码

推荐文档

相关文档

大家感兴趣的内容

随机列表