抓取网页是一项常见的任务。DotnetSpider是.NET Core平台下一个功能强大的爬虫框架,可以帮助我们轻松地完成这项工作。本文主要介绍如何通过DotnetSpider编写少量代码,快速地实现网页的抓取。

1、 通过Nuget安装引用DotnetSpider

在项目上右键 -》选择"管理Nuget程序包" -》搜索"DotnetSpider" -》点击"DotnetSpider.Core"安装,另外还需要安装"DotnetSpider.Extension"。

Nuget使用教程

2、数据存储EntityPipeline

可以使用框架提供的ConsoleEntityPipeline把结果输出到控制台,框架还支持Excel、MySQL、MongoDB等存储方式。这些实现类都位于DotnetSpider.Extension.Pipeline命名空间下,可以在该命名空间中查看其它EntityPipeline的实现类。也可以继承EntityPipeline类来实现自己的存储逻辑,例如:

    public class StoragePipeline : EntityPipeline
    {
        protected override int Process(List<IBaseEntity> items, dynamic sender = null)
        {
            if (items == null) return 0;
            DateTime dateTime;
            string dateTimeString = string.Empty;
            string path = "./web.txt";
            foreach (var data in items)
            {
                        lock (this)
                        {
                            if (!File.Exists(path))
                            {
                                File.Create(path);
                            }
                            var streamWriter = File.AppendText(path);
                            using (streamWriter)
                            {
                                streamWriter.Write(Newtonsoft.Json.JsonConvert.SerializeObject(data));
                                streamWriter.WriteLine();
                            }
                        }
                    }
                }
            }
            return items.Count;
        }
    }

3、爬虫类的实现

继承EntitySpider类来实现爬虫类,继承BaseEntity类来实现爬虫实体。实体属性要加上[Column]特性,通过[Field]特性编写XPath表达式来提取内容,还可以通过[ReplaceFormatter]特性对提取到的内容做字符串替换,最终的值会赋给对应的实体属性,例如:

  private class SpiderWeb : EntitySpider
        {
            protected override void OnInit(params string[] arguments)
            {
                var page = 1;
                var listRequest = new List<Request>();
               //循环添加要请求的url
                for (int i = 1; i < 500; i++)
                {
                    page = i;
                  listRequest.Add(new Request(string.Format("https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15", page),
                  new Dictionary<string, dynamic> { { "page", page } }));
                }
                AddRequests(listRequest);
                AddEntityType<StackoverflowSearchEntry>();
                //AddPipeline(new ConsoleEntityPipeline());
               AddPipeline(new StoragePipeline());
            }
            [Schema("stackoverflow", "stackoverflow_search_entity_model")]
            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
            class StackoverflowSearchEntry : BaseEntity
            {
                [Column]
                [Field(Expression = "page", Type = SelectorType.Enviroment)]
                public string Page { get; set; }
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a")]
                [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
                [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
                public string Title { get; set; }
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                public string Url { get; set; }
                [Column]
                [Field(Expression = ".//div[@class='summary']/div[1]")]
                public string description { get; set; }
                //匹配到的完整的内容
                [Column]
                [Field(Expression = ".", Option = FieldOptions.InnerText)]
                public string PlainText { get; set; }
            }
        }

4、DotnetSpider使用完整代码

using DotnetSpider.Downloader;
using DotnetSpider.Extension;
using DotnetSpider.Extension.Model;
using DotnetSpider.Extension.Pipeline;
using DotnetSpider.Extraction;
using DotnetSpider.Extraction.Model;
using DotnetSpider.Extraction.Model.Attribute;
using DotnetSpider.Extraction.Model.Formatter;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;
namespace SpiderContent
{
    /// <summary>
    /// Console entry point: configures the spider, runs it and waits for a key press.
    /// </summary>
    class Program
    {
        /// <summary>Creates the spider, runs the crawl and keeps the console open afterwards.</summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");

            var spider = new SpiderWeb
            {
                // Interval between fetches, so the site is not crawled too quickly
                // (presumably milliseconds; confirm against the framework).
                SleepTime = 1000
            };
            spider.Run();

            Console.ReadKey();
        }

        /// <summary>
        /// Spider that queues the paged Stack Overflow "python" question listing and
        /// hands every parsed <see cref="StackoverflowSearchEntry"/> to <see cref="StoragePipeline"/>.
        /// </summary>
        private class SpiderWeb : EntitySpider
        {
            /// <summary>Listing URL; {0} is replaced with the page number.</summary>
            private const string ListingUrlTemplate = "https://stackoverflow.com/questions/tagged/python?sort=frequent&page={0}&pagesize=15";

            /// <summary>First listing page that is requested.</summary>
            private const int FirstListingPage = 1;

            /// <summary>Last listing page that is requested (inclusive).</summary>
            private const int LastListingPage = 499;

            /// <summary>
            /// Registers the start requests, the entity type to extract and the storage pipeline.
            /// </summary>
            /// <param name="arguments">Not used by this spider.</param>
            protected override void OnInit(params string[] arguments)
            {
                AddRequests(BuildListingRequests());
                AddEntityType<StackoverflowSearchEntry>();
                // To print entities to the console instead, use:
                //AddPipeline(new ConsoleEntityPipeline());
                AddPipeline(new StoragePipeline());
            }

            /// <summary>
            /// Builds one request per listing page; each request carries its page number
            /// under the "page" property key.
            /// </summary>
            private static List<Request> BuildListingRequests()
            {
                var requests = new List<Request>();
                for (var pageNumber = FirstListingPage; pageNumber <= LastListingPage; pageNumber++)
                {
                    var url = string.Format(ListingUrlTemplate, pageNumber);
                    var properties = new Dictionary<string, dynamic> { { "page", pageNumber } };
                    requests.Add(new Request(url, properties));
                }

                return requests;
            }

            /// <summary>
            /// One entry of the question listing: each child element of the
            /// <c>div</c> whose id is "questions" is mapped to one instance.
            /// </summary>
            [Schema("stackoverflow", "stackoverflow_search_entity_model")]
            [Entity(Expression = "//div[@id='questions']/*", Type = SelectorType.XPath)]
            private class StackoverflowSearchEntry : BaseEntity
            {
                /// <summary>
                /// Value selected by the "page" environment expression
                /// (presumably the "page" property attached to the request; confirm against the framework).
                /// </summary>
                [Column]
                [Field(Expression = "page", Type = SelectorType.Enviroment)]
                public string Page { get; set; }

                /// <summary>Summary heading link, with "&lt;em&gt;" and "&lt;/em&gt;" replaced by empty strings.</summary>
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a")]
                [ReplaceFormatter(NewValue = "", OldValue = "<em>")]
                [ReplaceFormatter(NewValue = "", OldValue = "</em>")]
                public string Title { get; set; }

                /// <summary>The href attribute of the summary heading link.</summary>
                [Column]
                [Field(Expression = ".//div[@class='summary']/h3/a/@href")]
                public string Url { get; set; }

                /// <summary>First <c>div</c> inside the summary block.</summary>
                [Column]
                [Field(Expression = ".//div[@class='summary']/div[1]")]
                public string description { get; set; }

                /// <summary>Inner text of the whole matched element.</summary>
                [Column]
                [Field(Expression = ".", Option = FieldOptions.InnerText)]
                public string PlainText { get; set; }
            }
        }
    }
    /// <summary>
    /// Entity pipeline that appends every received entity to a local text file,
    /// one JSON document per line.
    /// </summary>
    public class StoragePipeline : EntityPipeline
    {
        /// <summary>Output file, resolved relative to the process working directory.</summary>
        private const string OutputPath = "./web.txt";

        // Dedicated lock object instead of lock(this): code outside this class that
        // holds a reference to the pipeline can no longer contend for the same monitor.
        // Static because every instance appends to the same file.
        private static readonly object FileLock = new object();

        /// <summary>
        /// Serializes each entity in <paramref name="items"/> to JSON and appends it
        /// to the output file as its own line.
        /// </summary>
        /// <param name="items">Entities to persist; may be null.</param>
        /// <param name="sender">Not used by this implementation.</param>
        /// <returns>
        /// The number of entities written, or 0 when <paramref name="items"/> is null or empty.
        /// </returns>
        protected override int Process(List<IBaseEntity> items, dynamic sender = null)
        {
            if (items == null || items.Count == 0)
            {
                return 0;
            }

            lock (FileLock)
            {
                // File.AppendText creates the file when it is missing, so no separate
                // File.Exists/File.Create step is needed. File.Create hands back an open
                // FileStream; leaving it undisposed keeps the file locked and makes the
                // subsequent append fail.
                using (var writer = File.AppendText(OutputPath))
                {
                    // One writer for the whole batch rather than reopening the file per item.
                    foreach (var item in items)
                    {
                        writer.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(item));
                    }
                }
            }

            return items.Count;
        }
    }
}

推荐文档