From a0b2c4e7c2eab347edb0d7fd1fba2780fb482035 Mon Sep 17 00:00:00 2001
From: Wesley <276679490@qq.com>
Date: Wed, 6 Jul 2016 23:43:56 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=80=E4=B8=AA=E7=AE=80=E5=8D=95=E7=9A=84?=
=?UTF-8?q?=E7=BD=91=E9=A1=B5=E7=88=AC=E8=99=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitattributes | 63 +++++
.gitignore | 245 ++++++++++++++++++
SimpleCrawler.sln | 22 ++
Wesley.Crawler.SimpleCrawler/App.config | 15 ++
Wesley.Crawler.SimpleCrawler/Models/City.cs | 15 ++
Wesley.Crawler.SimpleCrawler/Models/Hotel.cs | 19 ++
Wesley.Crawler.SimpleCrawler/Program.cs | 131 ++++++++++
.../Properties/AssemblyInfo.cs | 36 +++
Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs | 144 ++++++++++
.../Wesley.Crawler.SimpleCrawler.csproj | 63 +++++
10 files changed, 753 insertions(+)
create mode 100644 .gitattributes
create mode 100644 .gitignore
create mode 100644 SimpleCrawler.sln
create mode 100644 Wesley.Crawler.SimpleCrawler/App.config
create mode 100644 Wesley.Crawler.SimpleCrawler/Models/City.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Program.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..1ff0c42
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,63 @@
+###############################################################################
+# Set default behavior to automatically normalize line endings.
+###############################################################################
+* text=auto
+
+###############################################################################
+# Set default behavior for command prompt diff.
+#
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by command line
+###############################################################################
+#*.cs diff=csharp
+
+###############################################################################
+# Set the merge driver for project and solution files
+#
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below, in VS
+# the diff markers are never inserted). Diff markers may cause the following
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary and thus will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below
+###############################################################################
+#*.sln merge=binary
+#*.csproj merge=binary
+#*.vbproj merge=binary
+#*.vcxproj merge=binary
+#*.vcproj merge=binary
+#*.dbproj merge=binary
+#*.fsproj merge=binary
+#*.lsproj merge=binary
+#*.wixproj merge=binary
+#*.modelproj merge=binary
+#*.sqlproj merge=binary
+#*.wwaproj merge=binary
+
+###############################################################################
+# behavior for image files
+#
+# image files are treated as binary by default.
+###############################################################################
+#*.jpg binary
+#*.png binary
+#*.gif binary
+
+###############################################################################
+# diff behavior for common document formats
+#
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the
+# entries below.
+###############################################################################
+#*.doc diff=astextplain
+#*.DOC diff=astextplain
+#*.docx diff=astextplain
+#*.DOCX diff=astextplain
+#*.dot diff=astextplain
+#*.DOT diff=astextplain
+#*.pdf diff=astextplain
+#*.PDF diff=astextplain
+#*.rtf diff=astextplain
+#*.RTF diff=astextplain
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a2238d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,245 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+[Xx]64/
+[Xx]86/
+[Bb]uild/
+bld/
+[Bb]in/
+[Oo]bj/
+
+# Visual Studio 2015 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# DNX
+project.lock.json
+artifacts/
+
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# JustCode is a .NET coding add-in
+.JustCode
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+
+# TODO: Un-comment the next line if you do not want to checkin
+# your web deploy settings because they may include unencrypted
+# passwords
+#*.pubxml
+*.publishproj
+
+# NuGet Packages
+*.nupkg
+# The packages folder can be ignored because of Package Restore
+**/packages/*
+# except build/, which is used as an MSBuild target.
+!**/packages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/packages/repositories.config
+# NuGet v3's project.json files produces more ignoreable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Microsoft Azure ApplicationInsights config file
+ApplicationInsights.config
+
+# Windows Store app package directory
+AppPackages/
+BundleArtifacts/
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!*.[Cc]ache/
+
+# Others
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.pfx
+*.publishsettings
+node_modules/
+orleans.codegen.cs
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+*.mdf
+*.ldf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# LightSwitch generated files
+GeneratedArtifacts/
+ModelManifest.xml
+
+# Paket dependency manager
+.paket/paket.exe
+
+# FAKE - F# Make
+.fake/
\ No newline at end of file
diff --git a/SimpleCrawler.sln b/SimpleCrawler.sln
new file mode 100644
index 0000000..30eac47
--- /dev/null
+++ b/SimpleCrawler.sln
@@ -0,0 +1,22 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 14
+VisualStudioVersion = 14.0.25420.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wesley.Crawler.SimpleCrawler", "Wesley.Crawler.SimpleCrawler\Wesley.Crawler.SimpleCrawler.csproj", "{B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/Wesley.Crawler.SimpleCrawler/App.config b/Wesley.Crawler.SimpleCrawler/App.config
new file mode 100644
index 0000000..adb89d1
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/App.config
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/Wesley.Crawler.SimpleCrawler/Models/City.cs b/Wesley.Crawler.SimpleCrawler/Models/City.cs
new file mode 100644
index 0000000..9ea5365
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Models/City.cs
@@ -0,0 +1,15 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Wesley.Crawler.SimpleCrawler.Models
+{
+    /// <summary>A city link: the display name plus the address of that city's hotel list.</summary>
+    public class City
+    {
+        public string CityName { get; set; }// city name as captured from the link text
+        public Uri Uri { get; set; }// absolute address of the city's hotel page
+    }
+}
diff --git a/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs b/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
new file mode 100644
index 0000000..371b258
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
@@ -0,0 +1,19 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Wesley.Crawler.SimpleCrawler.Models
+{
+    /// <summary>
+    /// A hotel entry: display name, price and page address.
+    /// </summary>
+    public class Hotel
+    {
+        public string HotelName { get; set; }// hotel display name
+        public decimal Price { get; set; }// price; currency and unit are not shown in this file (TODO confirm)
+
+        public Uri Uri { get; set; }// presumably the hotel's detail-page address; verify against the caller
+    }
+}
diff --git a/Wesley.Crawler.SimpleCrawler/Program.cs b/Wesley.Crawler.SimpleCrawler/Program.cs
new file mode 100644
index 0000000..039ffaf
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Program.cs
@@ -0,0 +1,131 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.IO;
+using System.Text.RegularExpressions;
+using System.Collections;
+using System.Net;
+using Wesley.Crawler.SimpleCrawler.Models;
+
+namespace Wesley.Crawler.SimpleCrawler
+{
+    class Program
+    {
+        // Entry point: crawls the city list page and prints every city link it finds.
+        static void Main(string[] args)
+        {
+            var cityUrl = "http://hotels.ctrip.com/citylist";// crawler entry URL
+            var cityList = new List<City>();// distinct cities (name + hotel-list URL) collected so far
+            var cityCrawler = new SimpleCrawler();
+            cityCrawler.OnStart += (s, e) =>
+            {
+                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+            };
+            cityCrawler.OnError += (s, e) =>
+            {
+                Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+            };
+            cityCrawler.OnCompleted += (s, e) =>
+            {
+                // Extract every image-free <a href="/hotel/..."> anchor: "href" is the relative link, "text" the link text.
+                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
+                foreach (Match match in links)
+                {
+                    var city = new City
+                    {
+                        CityName = match.Groups["text"].Value,
+                        Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
+                    };
+                    // City has no value equality, so List.Contains on a new instance never matches; compare by URL instead.
+                    if (!cityList.Any(c => c.Uri == city.Uri)) cityList.Add(city);
+                    Console.WriteLine(city.CityName + "|" + city.Uri);// echo the city name and URL
+                }
+                Console.WriteLine("===============================================");
+                Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
+                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+                Console.WriteLine("线程:" + e.ThreadId);
+                Console.WriteLine("地址:" + e.Uri.ToString());
+            };
+            cityCrawler.Start(new Uri(cityUrl)).Wait();// pass a proxy (e.g. new WebProxy("60.221.50.118", 8090)) only if blocked
+
+            Console.ReadKey();// keep the console window open until a key is pressed
+        }
+    }
+
+
+
+ //抓取酒店
+ //var hotelUrl = "http://hotels.ctrip.com/hotel/zunyi558";
+ //var hotelList = new List();
+ //var hotelCrawler = new SimpleCrawler();
+ //hotelCrawler.OnStart += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+ //};
+ //hotelCrawler.OnError += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+ //};
+ //hotelCrawler.OnCompleted += (s, e) =>
+ //{
+ // var links = Regex.Matches(e.PageSource, @""">]+href=""*(?/hotel/[^>\s]+)""\s*data-dopost[^>]*>]+>.*?(?.*?)", RegexOptions.IgnoreCase);
+ // foreach (Match match in links)
+ // {
+ // var hotel = new Hotel
+ // {
+ // HotelName = match.Groups["text"].Value,
+ // Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value
+ // )
+ // };
+ // if (!hotelList.Contains(hotel)) hotelList.Add(hotel);//将数据加入到泛型列表
+ // Console.WriteLine(hotel.HotelName + "|" + hotel.Uri);//将酒店名称及详细页URL显示到控制台
+ // }
+
+ // Console.WriteLine();
+ // Console.WriteLine("===============================================");
+ // Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个酒店。");
+ // Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+ // Console.WriteLine("线程:" + e.ThreadId);
+ // Console.WriteLine("地址:" + e.Uri.ToString());
+ //};
+
+
+
+
+
+ //并发抓取
+ //var hotelList = new List() {
+ // new Hotel { HotelName="遵义浙商酒店", Uri=new Uri("/hotel/4983680.html?isFull=F") },
+ // new Hotel { HotelName="遵义森林大酒店", Uri=new Uri("/hotel/1665124.html?isFull=F") },
+ //};
+ //var hotelCrawler = new SimpleCrawler();
+ //hotelCrawler.OnStart += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+ //};
+ //hotelCrawler.OnError += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+ //};
+ //hotelCrawler.OnCompleted += (s, e) =>
+ //{
+ // Console.WriteLine();
+ // Console.WriteLine("===============================================");
+ // Console.WriteLine("爬虫抓取任务完成!");
+ // Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+ // Console.WriteLine("线程:" + e.ThreadId);
+ // Console.WriteLine("地址:" + e.Uri.ToString());
+ //};
+ //Parallel.For(0, 2 ,(i) =>
+ //{
+ // var hotel = hotelList[i];
+ // hotelCrawler.Start("http://hotels.ctrip.com"+hotel.Uri);
+ //});
+
+
+
+}
+
+
diff --git a/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs b/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..6954e3a
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// 有关程序集的一般信息由以下
+// 控制。更改这些特性值可修改
+// 与程序集关联的信息。
+[assembly: AssemblyTitle("Wesley.Crawler.SimpleCrawler")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Wesley.Crawler.SimpleCrawler")]
+[assembly: AssemblyCopyright("Copyright © 2016")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+//将 ComVisible 设置为 false 将使此程序集中的类型
+//对 COM 组件不可见。 如果需要从 COM 访问此程序集中的类型,
+//请将此类型的 ComVisible 特性设置为 true。
+[assembly: ComVisible(false)]
+
+// 如果此项目向 COM 公开,则下列 GUID 用于类型库的 ID
+[assembly: Guid("b4e2c232-d3a1-4c03-8ca6-65fc9a5d4b63")]
+
+// 程序集的版本信息由下列四个值组成:
+//
+// 主版本
+// 次版本
+// 生成号
+// 修订号
+//
+//可以指定所有这些值,也可以使用“生成号”和“修订号”的默认值,
+// 方法是按如下所示使用“*”: :
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
diff --git a/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs b/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
new file mode 100644
index 0000000..083123c
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
@@ -0,0 +1,144 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using System.Net;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Wesley.Crawler.SimpleCrawler
+{
+    /// <summary>
+    /// Downloads a single page with an HTTP GET and reports progress through the
+    /// <see cref="OnStart"/>, <see cref="OnCompleted"/> and <see cref="OnError"/> events.
+    /// </summary>
+    public class SimpleCrawler
+    {
+        public event EventHandler<OnStartEventArgs> OnStart;// raised before the request is built
+
+        public event EventHandler<OnCompletedEventArgs> OnCompleted;// raised once the body has been read
+
+        public event EventHandler<Exception> OnError;// raised with any exception thrown while crawling
+
+        public CookieContainer CookiesContainer { get; set; }// attached to every request; stays null unless the caller assigns it
+
+        public SimpleCrawler() { }
+
+        /// <summary>
+        /// Fetches <paramref name="uri"/> on a thread-pool thread. Exceptions are passed to
+        /// <see cref="OnError"/> instead of being thrown to the caller.
+        /// </summary>
+        /// <param name="uri">Address of the page to download.</param>
+        /// <param name="proxy">Optional proxy; when null the request keeps its default proxy.</param>
+        /// <returns>The page source decoded as UTF-8, or an empty string if the download failed.</returns>
+        public async Task<string> Start(Uri uri,WebProxy proxy=null)
+        {
+            return await Task.Run(() =>
+            {
+                var pageSource = string.Empty;
+                try
+                {
+                    if (this.OnStart != null) this.OnStart(this, new OnStartEventArgs(uri));
+                    var watch = Stopwatch.StartNew();
+                    var request = (HttpWebRequest)WebRequest.Create(uri);
+                    request.Accept = "*/*";
+                    request.ServicePoint.Expect100Continue = false;// skip the 100-continue handshake
+                    request.ServicePoint.UseNagleAlgorithm = false;// disable Nagle's algorithm
+                    request.AllowWriteStreamBuffering = false;// do not buffer the request body
+                    request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");// advertise compressed responses
+                    request.ContentType = "application/x-www-form-urlencoded";
+                    request.AllowAutoRedirect = false;// do not follow redirects
+                    // Present a desktop Chrome User-Agent.
+                    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
+                    request.Timeout = 5000;// request timeout in milliseconds (5 seconds)
+                    request.KeepAlive = true;
+                    request.Method = "GET";
+                    if (proxy != null)request.Proxy = proxy;
+                    request.CookieContainer = this.CookiesContainer;
+                    request.ServicePoint.ConnectionLimit = int.MaxValue;
+
+                    using (var response = (HttpWebResponse)request.GetResponse())
+                    {
+                        // Guard the copy so an unset container can never cause a NullReferenceException here.
+                        if (this.CookiesContainer != null)
+                        {
+                            foreach (Cookie cookie in response.Cookies) this.CookiesContainer.Add(cookie);
+                        }
+                        pageSource = ReadBody(response);
+                    }
+                    request.Abort();
+                    watch.Stop();
+                    var threadId = System.Threading.Thread.CurrentThread.ManagedThreadId;// thread that ran the download
+                    var milliseconds = watch.ElapsedMilliseconds;// time from just before request creation to end of read
+                    if (this.OnCompleted != null) this.OnCompleted(this, new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource));
+                }
+                catch (Exception ex)
+                {
+                    if (this.OnError != null) this.OnError(this, ex);
+                }
+                return pageSource;
+            });
+        }
+
+        /// <summary>
+        /// Reads the response body as UTF-8 text, inflating gzip or deflate content first.
+        /// </summary>
+        /// <param name="response">Response whose body is read; the caller remains responsible for disposing it.</param>
+        /// <returns>The decoded body text.</returns>
+        private static string ReadBody(HttpWebResponse response)
+        {
+            // Ordinal, case-insensitive match: ToLower() is culture-sensitive and turns "GZIP" into "gzıp" under tr-TR.
+            var encoding = response.ContentEncoding ?? string.Empty;
+            Stream body = response.GetResponseStream();
+            if (encoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
+            {
+                body = new GZipStream(body, CompressionMode.Decompress);
+            }
+            else if (encoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
+            {
+                body = new DeflateStream(body, CompressionMode.Decompress);
+            }
+            // Disposing the reader also closes the decompression stream and the network stream beneath it.
+            using (var reader = new StreamReader(body, Encoding.UTF8))
+            {
+                return reader.ReadToEnd();
+            }
+        }
+    }
+
+    /// <summary>
+    /// Event data for the crawler's start event: the address about to be requested.
+    /// </summary>
+    public class OnStartEventArgs
+    {
+        public Uri Uri { get; set; }// address the crawler is about to fetch
+
+        public OnStartEventArgs(Uri uri)
+        {
+            Uri = uri;
+        }
+    }
+
+    /// <summary>
+    /// Event data for the crawler's completed event: what was fetched, by which thread, and how long it took.
+    /// </summary>
+    public class OnCompletedEventArgs
+    {
+        public Uri Uri { get; private set; }// address that was fetched
+        public int ThreadId { get; private set; }// managed id of the thread that ran the download
+        public string PageSource { get; private set; }// downloaded page source
+        public long Milliseconds { get; private set; }// elapsed request time in milliseconds
+        public OnCompletedEventArgs(Uri uri, int threadId, long milliseconds, string pageSource)
+        {
+            Uri = uri;
+            PageSource = pageSource;
+            Milliseconds = milliseconds;
+            ThreadId = threadId;
+        }
+    }
+
+
+
+}
diff --git a/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj b/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
new file mode 100644
index 0000000..7a9a4b1
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
@@ -0,0 +1,63 @@
+
+
+
+
+ Debug
+ AnyCPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}
+ Exe
+ Properties
+ Wesley.Crawler.SimpleCrawler
+ Wesley.Crawler.SimpleCrawler
+ v4.5.2
+ 512
+ true
+
+
+ AnyCPU
+ true
+ full
+ false
+ bin\Debug\
+ DEBUG;TRACE
+ prompt
+ 4
+
+
+ AnyCPU
+ pdbonly
+ true
+ bin\Release\
+ TRACE
+ prompt
+ 4
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file