From a0b2c4e7c2eab347edb0d7fd1fba2780fb482035 Mon Sep 17 00:00:00 2001
From: Wesley <>
Date: Wed, 6 Jul 2016 23:43:56 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=80=E4=B8=AA=E7=AE=80=E5=8D=95=E7=9A=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
.gitattributes | 63 +++++
.gitignore | 245 ++++++++++++++++++
SimpleCrawler.sln | 22 ++
Wesley.Crawler.SimpleCrawler/App.config | 15 ++
Wesley.Crawler.SimpleCrawler/Models/City.cs | 15 ++
Wesley.Crawler.SimpleCrawler/Models/Hotel.cs | 19 ++
Wesley.Crawler.SimpleCrawler/Program.cs | 131 ++++++++++
.../Properties/AssemblyInfo.cs | 36 +++
Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs | 144 ++++++++++
.../Wesley.Crawler.SimpleCrawler.csproj | 63 +++++
10 files changed, 753 insertions(+)
create mode 100644 .gitattributes
create mode 100644 .gitignore
create mode 100644 SimpleCrawler.sln
create mode 100644 Wesley.Crawler.SimpleCrawler/App.config
create mode 100644 Wesley.Crawler.SimpleCrawler/Models/City.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Program.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
create mode 100644 Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..1ff0c42
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,63 @@
+# Set default behavior to automatically normalize line endings.
+* text=auto
+# Set default behavior for command prompt diff.
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by command line
+#*.cs diff=csharp
+# Set the merge driver for project and solution files
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below, in VS
+# the diff markers are never inserted). Diff markers may cause the following
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary and thus will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below
+#*.sln merge=binary
+#*.csproj merge=binary
+#*.vbproj merge=binary
+#*.vcxproj merge=binary
+#*.vcproj merge=binary
+#*.dbproj merge=binary
+#*.fsproj merge=binary
+#*.lsproj merge=binary
+#*.wixproj merge=binary
+#*.modelproj merge=binary
+#*.sqlproj merge=binary
+#*.wwaproj merge=binary
+# behavior for image files
+# image files are treated as binary by default.
+#*.jpg binary
+#*.png binary
+#*.gif binary
+# diff behavior for common document formats
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the
+# entries below.
+#*.doc diff=astextplain
+#*.DOC diff=astextplain
+#*.docx diff=astextplain
+#*.DOCX diff=astextplain
+#*.dot diff=astextplain
+#*.DOT diff=astextplain
+#*.pdf diff=astextplain
+#*.PDF diff=astextplain
+#*.rtf diff=astextplain
+#*.RTF diff=astextplain
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3a2238d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,245 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+# User-specific files
+# User-specific files (MonoDevelop/Xamarin Studio)
+# Build results
+# Visual Studio 2015 cache/options directory
+# Uncomment if you have tasks that create the project's static files in wwwroot
+# MSTest test Results
+# Build Results of an ATL Project
+# DNX
+# Chutzpah Test files
+# Visual C++ cache files
+# Visual Studio profiler
+# TFS 2012 Local Workspace
+# Guidance Automation Toolkit
+# ReSharper is a .NET coding add-in
+# JustCode is a .NET coding add-in
+# TeamCity is a build add-in
+# DotCover is a Code Coverage Tool
+# NCrunch
+# MightyMoose
+# Web workbench (sass)
+# Installshield output folder
+# DocProject is a documentation generator add-in
+# Click-Once directory
+# Publish Web Output
+# TODO: Un-comment the next line if you do not want to checkin
+# your web deploy settings because they may include unencrypted
+# passwords
+# NuGet Packages
+# The packages folder can be ignored because of Package Restore
+# except build/, which is used as an MSBuild target.
+# Uncomment if necessary however generally it will be regenerated when needed
+# NuGet v3's project.json files produce more ignorable files
+# Microsoft Azure Build Output
+# Microsoft Azure Emulator
+# Microsoft Azure ApplicationInsights config file
+# Windows Store app package directory
+# Visual Studio cache files
+# files ending in .cache can be ignored
+# but keep track of directories ending in .cache
+# Others
+# RIA/Silverlight projects
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+# SQL Server files
+# Business Intelligence projects
+# Microsoft Fakes
+# GhostDoc plugin setting file
+# Node.js Tools for Visual Studio
+# Visual Studio 6 build log
+# Visual Studio 6 workspace options file
+# Visual Studio LightSwitch build output
+# LightSwitch generated files
+# Paket dependency manager
+# FAKE - F# Make
\ No newline at end of file
diff --git a/SimpleCrawler.sln b/SimpleCrawler.sln
new file mode 100644
index 0000000..30eac47
--- /dev/null
+++ b/SimpleCrawler.sln
@@ -0,0 +1,22 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 14
+VisualStudioVersion = 14.0.25420.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Wesley.Crawler.SimpleCrawler", "Wesley.Crawler.SimpleCrawler\Wesley.Crawler.SimpleCrawler.csproj", "{B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}"
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
diff --git a/Wesley.Crawler.SimpleCrawler/App.config b/Wesley.Crawler.SimpleCrawler/App.config
new file mode 100644
index 0000000..adb89d1
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/App.config
@@ -0,0 +1,15 @@
diff --git a/Wesley.Crawler.SimpleCrawler/Models/City.cs b/Wesley.Crawler.SimpleCrawler/Models/City.cs
new file mode 100644
index 0000000..9ea5365
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Models/City.cs
@@ -0,0 +1,15 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+namespace Wesley.Crawler.SimpleCrawler.Models
+ /// <summary>
+ /// Plain data holder for one city link found by the crawler.
+ /// </summary>
+ public class City
+ {
+ /// <summary>Display name of the city (Program fills it from the matched link text).</summary>
+ public string CityName { get; set; }
+ /// <summary>Address associated with the city (Program fills it from the matched href).</summary>
+ public Uri Uri { get; set; }
+ }
diff --git a/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs b/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
new file mode 100644
index 0000000..371b258
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Models/Hotel.cs
@@ -0,0 +1,19 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+namespace Wesley.Crawler.SimpleCrawler.Models
+ /// <summary>
+ /// Plain data holder for one hotel entry.
+ /// </summary>
+ public class Hotel
+ {
+ /// <summary>Display name of the hotel.</summary>
+ public string HotelName { get; set; }
+ /// <summary>
+ /// Price of the hotel. NOTE(review): never assigned in the visible code;
+ /// currency and unit are not specified here - confirm with the author.
+ /// </summary>
+ public decimal Price { get; set; }
+ /// <summary>Address of the hotel's page.</summary>
+ public Uri Uri { get; set; }
+ }
diff --git a/Wesley.Crawler.SimpleCrawler/Program.cs b/Wesley.Crawler.SimpleCrawler/Program.cs
new file mode 100644
index 0000000..039ffaf
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Program.cs
@@ -0,0 +1,131 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using System.IO;
+using System.Text.RegularExpressions;
+using System.Collections;
+using System.Net;
+using Wesley.Crawler.SimpleCrawler.Models;
+namespace Wesley.Crawler.SimpleCrawler
+    /// <summary>
+    /// Console entry point: crawls a city index page and lists every city link found on it.
+    /// </summary>
+    class Program
+    {
+        /// <summary>
+        /// Downloads the entry page with <see cref="SimpleCrawler"/>, extracts the
+        /// <c>/hotel/...</c> links from its source and prints each city name and URL.
+        /// </summary>
+        /// <param name="args">
+        /// Optional. When present, the first argument overrides the built-in entry URL.
+        /// </param>
+        static void Main(string[] args)
+        {
+            // Crawl the cities.
+            // Entry URL of the crawl. The built-in literal is empty in this file, so an
+            // absolute URL can be supplied as the first command-line argument instead.
+            var cityUrl = (args != null && args.Length > 0) ? args[0] : "";
+            var cityList = new List<City>(); // City names with their hotel-list URLs.
+            var seenUris = new HashSet<Uri>(); // City is compared by reference, so de-duplicate on the URI.
+            var cityCrawler = new SimpleCrawler();
+            cityCrawler.OnStart += (s, e) =>
+            {
+                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+            };
+            cityCrawler.OnError += (s, e) =>
+            {
+                Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+            };
+            cityCrawler.OnCompleted += (s, e) =>
+            {
+                // Extract the links from the page source with a regular expression.
+                // NOTE(review): the pattern in this file had its angle-bracket parts removed;
+                // the named groups follow from the Groups["href"]/Groups["text"] lookups
+                // below, while the surrounding <a ...> ... </a> tag is reconstructed - confirm.
+                var links = Regex.Matches(e.PageSource, @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>", RegexOptions.IgnoreCase);
+                foreach (Match match in links)
+                {
+                    var city = new City
+                    {
+                        CityName = match.Groups["text"].Value,
+                        // The href is root-relative, so resolve it against the crawled page.
+                        Uri = new Uri(e.Uri, match.Groups["href"].Value)
+                    };
+                    if (seenUris.Add(city.Uri))
+                    {
+                        cityList.Add(city); // First occurrence of this URL: remember the city.
+                    }
+                    Console.WriteLine(city.CityName + "|" + city.Uri); // Show name and URL on the console.
+                }
+                Console.WriteLine("===============================================");
+                Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
+                Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+                Console.WriteLine("线程:" + e.ThreadId);
+                Console.WriteLine("地址:" + e.Uri.ToString());
+            };
+
+            Uri entryUri;
+            if (Uri.TryCreate(cityUrl, UriKind.Absolute, out entryUri))
+            {
+                // Do not use a proxy unless the crawler has been blocked: new WebProxy("", 8090)
+                cityCrawler.Start(entryUri).Wait();
+            }
+            else
+            {
+                // new Uri("") would throw before any handler runs; report it like other crawl errors.
+                Console.WriteLine("爬虫抓取出现错误:" + "Invalid entry URL: '" + cityUrl + "'");
+            }
+            Console.ReadKey();
+        }
+    }
+ //抓取酒店
+ //var hotelUrl = "";
+ //var hotelList = new List();
+ //var hotelCrawler = new SimpleCrawler();
+ //hotelCrawler.OnStart += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+ //};
+ //hotelCrawler.OnError += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+ //};
+ //hotelCrawler.OnCompleted += (s, e) =>
+ //{
+ // var links = Regex.Matches(e.PageSource, @""">]+href=""*(?/hotel/[^>\s]+)""\s*data-dopost[^>]*>]+>.*?(?.*?)", RegexOptions.IgnoreCase);
+ // foreach (Match match in links)
+ // {
+ // var hotel = new Hotel
+ // {
+ // HotelName = match.Groups["text"].Value,
+ // Uri = new Uri("" + match.Groups["href"].Value
+ // )
+ // };
+ // if (!hotelList.Contains(hotel)) hotelList.Add(hotel);//将数据加入到泛型列表
+ // Console.WriteLine(hotel.HotelName + "|" + hotel.Uri);//将酒店名称及详细页URL显示到控制台
+ // }
+ // Console.WriteLine();
+ // Console.WriteLine("===============================================");
+ // Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个酒店。");
+ // Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+ // Console.WriteLine("线程:" + e.ThreadId);
+ // Console.WriteLine("地址:" + e.Uri.ToString());
+ //};
+ //并发抓取
+ //var hotelList = new List() {
+ // new Hotel { HotelName="遵义浙商酒店", Uri=new Uri("/hotel/4983680.html?isFull=F") },
+ // new Hotel { HotelName="遵义森林大酒店", Uri=new Uri("/hotel/1665124.html?isFull=F") },
+ //};
+ //var hotelCrawler = new SimpleCrawler();
+ //hotelCrawler.OnStart += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
+ //};
+ //hotelCrawler.OnError += (s, e) =>
+ //{
+ // Console.WriteLine("爬虫抓取出现错误:" + e.Message);
+ //};
+ //hotelCrawler.OnCompleted += (s, e) =>
+ //{
+ // Console.WriteLine();
+ // Console.WriteLine("===============================================");
+ // Console.WriteLine("爬虫抓取任务完成!");
+ // Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
+ // Console.WriteLine("线程:" + e.ThreadId);
+ // Console.WriteLine("地址:" + e.Uri.ToString());
+ //};
+ //Parallel.For(0, 2 ,(i) =>
+ //{
+ // var hotel = hotelList[i];
+ // hotelCrawler.Start(""+hotel.Uri);
+ //});
diff --git a/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs b/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..6954e3a
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+// General information about the assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with the assembly.
+[assembly: AssemblyTitle("Wesley.Crawler.SimpleCrawler")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Wesley.Crawler.SimpleCrawler")]
+[assembly: AssemblyCopyright("Copyright © 2016")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If a type in this assembly must be accessed from COM,
+// set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+// The following GUID is the ID of the type library if this project is exposed to COM.
+[assembly: Guid("b4e2c232-d3a1-4c03-8ca6-65fc9a5d4b63")]
+// Version information for an assembly consists of the following four values:
+//   Major Version
+//   Minor Version
+//   Build Number
+//   Revision
+// The Build and Revision numbers can be defaulted by using '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+// NOTE(review): the version literals were empty in this file, which is not a valid
+// version string; "1.0.0.0" is assumed as the initial version - confirm.
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
diff --git a/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs b/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
new file mode 100644
index 0000000..083123c
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/SimpleCrawler.cs
@@ -0,0 +1,144 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.IO.Compression;
+using System.Linq;
+using System.Net;
+using System.Text;
+using System.Threading.Tasks;
+namespace Wesley.Crawler.SimpleCrawler
+    /// <summary>
+    /// A minimal single-page HTTP crawler. Each <see cref="Start"/> call downloads one page
+    /// inside <c>Task.Run</c> and reports progress through the <see cref="OnStart"/>,
+    /// <see cref="OnCompleted"/> and <see cref="OnError"/> events.
+    /// </summary>
+    public class SimpleCrawler
+    {
+        /// <summary>Raised before the request for a page is created.</summary>
+        public event EventHandler<OnStartEventArgs> OnStart;
+
+        /// <summary>Raised after the page body has been read completely.</summary>
+        public event EventHandler<OnCompletedEventArgs> OnCompleted;
+
+        /// <summary>Raised with the caught exception when any step of a crawl fails.</summary>
+        public event EventHandler<Exception> OnError;
+
+        /// <summary>
+        /// Cookie container attached to every request so that cookies set by one response
+        /// are available to later requests. May be replaced, or set to <c>null</c> to
+        /// send requests without a cookie container.
+        /// </summary>
+        public CookieContainer CookiesContainer { get; set; }
+
+        /// <summary>Creates a crawler with an empty cookie container.</summary>
+        public SimpleCrawler()
+        {
+            // Without a container the cookie handling in Start never has anything to store.
+            this.CookiesContainer = new CookieContainer();
+        }
+
+        /// <summary>
+        /// Downloads the page at <paramref name="uri"/> on a background task.
+        /// </summary>
+        /// <param name="uri">Address of the page to fetch.</param>
+        /// <param name="proxy">Optional proxy server; <c>null</c> leaves the request's proxy untouched.</param>
+        /// <returns>
+        /// The page source, or an empty string when the crawl failed before the body was read.
+        /// Failures are reported through <see cref="OnError"/> instead of being thrown; this
+        /// includes exceptions thrown by <see cref="OnStart"/> or <see cref="OnCompleted"/> handlers.
+        /// </returns>
+        public async Task<string> Start(Uri uri, WebProxy proxy = null)
+        {
+            return await Task.Run(() =>
+            {
+                var pageSource = string.Empty;
+                try
+                {
+                    // Copy each delegate to a local so a concurrent unsubscribe cannot null it
+                    // between the check and the call.
+                    var onStart = this.OnStart;
+                    if (onStart != null)
+                    {
+                        onStart(this, new OnStartEventArgs(uri));
+                    }
+
+                    var watch = Stopwatch.StartNew();
+                    var request = this.CreateRequest(uri, proxy);
+                    using (var response = (HttpWebResponse)request.GetResponse())
+                    {
+                        var cookies = this.CookiesContainer;
+                        if (cookies != null)
+                        {
+                            // Keep the cookies of this response for later requests.
+                            foreach (Cookie cookie in response.Cookies)
+                            {
+                                cookies.Add(cookie);
+                            }
+                        }
+                        pageSource = ReadBody(response);
+                    }
+                    request.Abort();
+                    watch.Stop();
+
+                    var threadId = System.Threading.Thread.CurrentThread.ManagedThreadId;
+                    var milliseconds = watch.ElapsedMilliseconds;
+                    var onCompleted = this.OnCompleted;
+                    if (onCompleted != null)
+                    {
+                        onCompleted(this, new OnCompletedEventArgs(uri, threadId, milliseconds, pageSource));
+                    }
+                }
+                catch (Exception ex)
+                {
+                    var onError = this.OnError;
+                    if (onError != null)
+                    {
+                        onError(this, ex);
+                    }
+                }
+                return pageSource;
+            });
+        }
+
+        /// <summary>Builds the GET request for <paramref name="uri"/> with the crawler's fixed settings.</summary>
+        private HttpWebRequest CreateRequest(Uri uri, WebProxy proxy)
+        {
+            var request = (HttpWebRequest)WebRequest.Create(uri);
+            request.Accept = "*/*";
+            request.ServicePoint.Expect100Continue = false; // Skip the 100-continue handshake.
+            request.ServicePoint.UseNagleAlgorithm = false; // Disable Nagle's algorithm.
+            request.AllowWriteStreamBuffering = false; // Do not buffer data written to the request.
+            request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate"); // Accept compressed pages.
+            request.ContentType = "application/x-www-form-urlencoded";
+            request.AllowAutoRedirect = false; // Do not follow redirects.
+            // Send the User-Agent string of a Google Chrome browser.
+            request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";
+            request.Timeout = 5000; // Request timeout in milliseconds (5 seconds).
+            request.KeepAlive = true; // Use a persistent connection.
+            request.Method = "GET";
+            if (proxy != null)
+            {
+                request.Proxy = proxy;
+            }
+            request.CookieContainer = this.CookiesContainer;
+            request.ServicePoint.ConnectionLimit = int.MaxValue; // No practical cap on connections.
+            return request;
+        }
+
+        /// <summary>
+        /// Reads the whole response body as UTF-8 text, decompressing it first when the
+        /// Content-Encoding header names gzip or deflate.
+        /// </summary>
+        private static string ReadBody(HttpWebResponse response)
+        {
+            var contentEncoding = response.ContentEncoding ?? string.Empty;
+            Stream body = response.GetResponseStream();
+            // Ordinal, case-insensitive matching: ToLower() depends on the current culture.
+            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
+            {
+                body = new GZipStream(body, CompressionMode.Decompress);
+            }
+            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
+            {
+                body = new DeflateStream(body, CompressionMode.Decompress);
+            }
+
+            // Disposing the reader also disposes the stream it wraps.
+            using (var reader = new StreamReader(body, Encoding.UTF8))
+            {
+                return reader.ReadToEnd();
+            }
+        }
+    }
+    /// <summary>
+    /// Event data for the crawler's start event.
+    /// </summary>
+    public class OnStartEventArgs : EventArgs
+    {
+        /// <summary>Address of the page the crawler is about to fetch.</summary>
+        public Uri Uri { get; set; }
+
+        /// <summary>Creates the event data for the given address.</summary>
+        /// <param name="uri">Address of the page the crawler is about to fetch.</param>
+        public OnStartEventArgs(Uri uri)
+        {
+            this.Uri = uri;
+        }
+    }
+    /// <summary>
+    /// Event data for the crawler's completed event.
+    /// </summary>
+    public class OnCompletedEventArgs : EventArgs
+    {
+        /// <summary>Address of the page that was fetched.</summary>
+        public Uri Uri { get; private set; }
+
+        /// <summary>Managed thread ID reported for the crawl task.</summary>
+        public int ThreadId { get; private set; }
+
+        /// <summary>Source text of the downloaded page.</summary>
+        public string PageSource { get; private set; }
+
+        /// <summary>Time the request took, in milliseconds.</summary>
+        public long Milliseconds { get; private set; }
+
+        /// <summary>Creates the event data for one finished crawl.</summary>
+        /// <param name="uri">Address of the page that was fetched.</param>
+        /// <param name="threadId">Managed thread ID reported for the crawl task.</param>
+        /// <param name="milliseconds">Time the request took, in milliseconds.</param>
+        /// <param name="pageSource">Source text of the downloaded page.</param>
+        public OnCompletedEventArgs(Uri uri, int threadId, long milliseconds, string pageSource)
+        {
+            this.Uri = uri;
+            this.ThreadId = threadId;
+            this.Milliseconds = milliseconds;
+            this.PageSource = pageSource;
+        }
+    }
diff --git a/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj b/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
new file mode 100644
index 0000000..7a9a4b1
--- /dev/null
+++ b/Wesley.Crawler.SimpleCrawler/Wesley.Crawler.SimpleCrawler.csproj
@@ -0,0 +1,63 @@
+ Debug
+ AnyCPU
+ {B4E2C232-D3A1-4C03-8CA6-65FC9A5D4B63}
+ Exe
+ Properties
+ Wesley.Crawler.SimpleCrawler
+ Wesley.Crawler.SimpleCrawler
+ v4.5.2
+ 512
+ true
+ AnyCPU
+ true
+ full
+ false
+ bin\Debug\
+ prompt
+ 4
+ AnyCPU
+ pdbonly
+ true
+ bin\Release\
+ prompt
+ 4
\ No newline at end of file