1、新建.NET Core的Console 项目并添加DotnetSpider引用
1) Visual Studio
右键单击解决方案并打开 Manage NuGet Packages(管理 NuGet 包),搜索 DotnetSpider,从结果列表中选中 DotnetSpider 并安装到项目。
2) Package Manager
Install-Package DotnetSpider -Version 5.0.0-beta1
3) 使用dotnet命令
dotnet add package DotnetSpider --version 5.0.0-beta1
2、添加 Serilog 日志组件
编辑 .csproj 项目文件,添加下面的引用:
<PackageReference Include="Serilog.AspNetCore" Version="3.2.0"/>
<PackageReference Include="Serilog.Sinks.Console" Version="3.1.1"/>
<PackageReference Include="Serilog.Sinks.RollingFile" Version="3.3.0"/>
<PackageReference Include="Serilog.Sinks.PeriodicBatching" Version="2.3.0"/>
3、创建 GithubSpider 类
/// <summary>
/// Sample spider that requests a single GitHub profile page, runs it through
/// a custom <see cref="Parser"/> and hands the parsed values to a
/// <c>ConsoleStorage</c> data flow.
/// </summary>
public class GithubSpider : Spider
{
    /// <summary>
    /// Forwards the injected options, services and logger to the base spider.
    /// </summary>
    /// <param name="options">Spider options passed through to the base class.</param>
    /// <param name="services">Spider services passed through to the base class.</param>
    /// <param name="logger">Logger passed through to the base class.</param>
    public GithubSpider(
        IOptions<SpiderOptions> options,
        SpiderServices services,
        ILogger<Spider> logger)
        : base(options, services, logger)
    {
    }

    /// <summary>
    /// Registers the data flows (parser first, then console storage) and
    /// queues the start request.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the caller; not used here.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken)
    {
        // Custom parsing step.
        AddDataFlow(new Parser());

        // Console storage step.
        AddDataFlow(new ConsoleStorage());

        // Queue the page to crawl.
        await AddRequestsAsync("https://github.com/zlzforever");
    }

    /// <summary>
    /// Supplies a freshly generated GUID (format "N") as the id and
    /// "Github" as the name.
    /// </summary>
    protected override (string Id, string Name) GetIdAndName()
        => (Guid.NewGuid().ToString("N"), "Github");

    /// <summary>
    /// Extracts two values from the downloaded page via XPath and stores them
    /// on the context under the keys "author" and "username".
    /// </summary>
    private class Parser : DataParser
    {
        // XPath used for the value stored under "author".
        private const string AuthorXPath =
            "//span[@class='p-name vcard-fullname d-block overflow-hidden']";

        // XPath used for the value stored under "username".
        private const string UsernameXPath =
            "//span[@class='p-nickname vcard-username d-block']";

        /// <summary>
        /// Evaluates both XPath queries first, then records the results on
        /// <paramref name="context"/>. A query whose result is null yields a
        /// null value (null-conditional access).
        /// </summary>
        /// <param name="context">Context exposing the selectable page and receiving the data.</param>
        /// <returns>An already completed task; the work is synchronous.</returns>
        protected override Task Parse(DataContext context)
        {
            var page = context.Selectable;

            // Both lookups happen before any data is added to the context.
            var authorValue = page.XPath(AuthorXPath)?.Value;
            var usernameValue = page.XPath(UsernameXPath)?.Value;

            context.AddData("author", authorValue);
            context.AddData("username", usernameValue);

            return Task.CompletedTask;
        }
    }
}
4、在 Main 方法中添加如下代码
/// <summary>
/// Entry point: configures the static Serilog logger, builds a host for
/// <c>GithubSpider</c>, runs it to completion and then exits the process
/// with code 0.
/// </summary>
/// <param name="args">Command-line arguments; not used here.</param>
static async Task Main(string[] args)
{
    // Information by default; the listed framework namespaces are raised to Warning.
    // Events go to the console and to a rolling file under logs/.
    Log.Logger = new LoggerConfiguration()
        .MinimumLevel.Information()
        .MinimumLevel.Override("Microsoft.Hosting.Lifetime", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
        .MinimumLevel.Override("System", LogEventLevel.Warning)
        .MinimumLevel.Override("Microsoft.AspNetCore.Authentication", LogEventLevel.Warning)
        .Enrich.FromLogContext()
        .WriteTo.Console().WriteTo.RollingFile("logs/spiders.log")
        .CreateLogger();

    try
    {
        var builder = Builder.CreateDefaultBuilder<GithubSpider>(options =>
        {
            // 1 request per second.
            options.Speed = 1;
            // Request timeout (unit not visible here; presumably seconds — confirm).
            options.RequestTimeout = 10;
        });
        builder.UseSerilog();
        builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
        await builder.Build().RunAsync();
    }
    finally
    {
        // Flush and dispose the Serilog sinks before the process ends (normal
        // exit or failure) so that pending log events are not lost when
        // Environment.Exit terminates the process.
        Log.CloseAndFlush();
    }

    Environment.Exit(0);
}
5、运行代码查看效果
[17:36:53 INF] Argument: RequestedQueueCount, 100
[17:36:53 INF] Argument: Depth, 0
[17:36:53 INF] Argument: RequestTimeout, 10
[17:36:53 INF] Argument: RetriedTimes, 3
[17:36:53 INF] Argument: EmptySleepTime, 10
[17:36:53 INF] Argument: Speed, 1
[17:36:53 INF] Argument: ProxyTestUri, http://www.baidu.com
[17:36:53 INF] Argument: ProxySupplierUri,
[17:36:53 INF] Argument: UseProxy, False
[17:36:53 INF] Argument: RemoveOutboundLinks, False
[17:36:53 INF] Argument: StorageConnectionString,
[17:36:53 INF] Argument: Storage,
[17:36:53 INF] Argument: ConnectionString,
[17:36:53 INF] Argument: Database, dotnetspider
[17:36:53 INF] Argument: StorageMode, InsertIgnoreDuplicate
[17:36:53 INF] Argument: MySqlFileType, LoadFile
[17:36:53 INF] Argument: SqlServerVersion, V2000
[17:36:53 INF] Argument: HBaseRestServer,
[17:36:53 INF] None proxy supplier
[17:36:53 INF] Statistics service starting
[17:36:53 INF] Agent register service starting
[17:36:53 INF] Statistics service started
[17:36:53 INF] Agent register service started
[17:36:53 INF] Agent starting
[17:36:54 INF] Initialize d9531ecc28a5492ab58e9d8b47a6bf05, Github
[17:36:54 INF] Agent started
[17:36:54 INF] d9531ecc28a5492ab58e9d8b47a6bf05, Github DataFlows: Parser -> ConsoleStorage
[17:36:54 INF] Register topic DOTNET_SPIDER_D9531ECC28A5492AB58E9D8B47A6BF05
[17:36:54 INF] d9531ecc28a5492ab58e9d8b47a6bf05, Github started
[17:36:56 INF] https://github.com/zlzforever download success
[{"Key":"username","Value":"zlzforever"},{"Key":"author","Value":"Lewis Zou"}]
[17:36:58 INF] d9531ecc28a5492ab58e9d8b47a6bf05 total 1, success 1, failed 0, left 0
[17:37:03 INF] d9531ecc28a5492ab58e9d8b47a6bf05 total 1, success 1, failed 0, left 0
[17:37:05 INF] d9531ecc28a5492ab58e9d8b47a6bf05, Github stopping
[17:37:05 INF] d9531ecc28a5492ab58e9d8b47a6bf05, Github stopped
[17:37:05 INF] Agent stopping
[17:37:05 INF] Agent stopped
[17:37:05 INF] Agent register service stopping
[17:37:05 INF] Agent register service stopped
[17:37:05 INF] Statistics service stopping
[17:37:05 INF] Statistics service stopped
原文地址:https://github.com/dotnetcore/DotnetSpider/wiki/1-第一个简单的爬虫
相关文档: