Crawler.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
/*
Web Crawler
Takes a domain and visits its pages looking for email addresses.
Scrapes the emails found on those pages.
*/
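/*
Minimal usage sketch (the console entry point below is an assumption,
not part of this file):

    Crawler crawler = new Crawler("example.com");
    List<String> emails = crawler.scrapWeb();
    foreach (String email in emails) Console.WriteLine(email);
*/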
namespace whois_scrapper
{
public class Crawler
{
private String domain; //Domain to scrape
private List<String> links; //Scraped links, tracked to avoid visiting the same URL twice
private List<String> emails; //Scraped emails, tracked to avoid duplicates
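//Matches URLs inside <a href='...'> / <a href="..."> attributes; the capture groups
//roughly cover an optional scheme://credentials@host:port prefix, then path, query and fragment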
private static Regex anchorRegex = new Regex("(?<=<a\\s*?href=(?:'|\"))(([\\w\\.\\-\\+]+:)\\/{2}(([\\w\\d\\.]+):([\\w\\d\\.]+))?@?(([a-zA-Z0-9\\.\\-_]+)(?::(\\d{1,5}))?))?(\\/(?:[a-zA-Z0-9\\.\\-\\/\\+\\%]+)?)(?:\\?([a-zA-Z0-9=%\\-_\\.\\*&;]+))?(?:#([a-zA-Z0-9\\-=,&%;\\/\\\"'\\?]+)?)?", RegexOptions.IgnoreCase);
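//Matches plain-text email addresses such as user@example.com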
private static Regex emailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
//Matches URLs that point to static assets (images, documents, media, archives) that should not be crawled
private static Regex fileRegex = new Regex(@"^.*\.(jpg|jpeg|png|gif|doc|pdf|avi|css|exe|midi|mid|mp3|raw|mpeg|mpg|ram|rar|tiff|txt|wav|zip|7zip|iso|dmg|js|swf|svg|rss|xml|atom|webm|mp4|ogg|flac)$", RegexOptions.IgnoreCase);
public static int levels = 3; //Default maximum URL depth (path segments) to scrape
public static int pages = 100; //Default maximum number of pages to scrape
//Crawler constructor
public Crawler(String domain)
{
this.domain = domain;
links = new List<String>();
emails = new List<String>();
}
//Main entry point
//Starts the scraping process for the domain and returns every unique email found
public List<String> scrapWeb()
{
//Recursively scrape pages, starting at the root of the domain over plain HTTP
scrapePage("http://" + domain, domain);
return emails;
}
//Scrape a single page given its URL
public void scrapePage(String pageUrl, String domain)
{
//Skip file URLs and stop once the page limit has been reached
if (!isAFile(pageUrl) && (links.Count <= pages))
{
//Attempt to download the HTML content of the URL
try
{
String htmlContent;
WebRequest webRequest = WebRequest.Create(pageUrl);
using (WebResponse webResponse = webRequest.GetResponse())
using (Stream streamResponse = webResponse.GetResponseStream())
using (StreamReader sreader = new StreamReader(streamResponse))
{
htmlContent = sreader.ReadToEnd();
}
//Get the emails of the page
getPageEmails(htmlContent, domain);
//Get the anchors of the page
List<String> temporalLinks = getPageAnchors(htmlContent, domain);
//Scrape every unique URL found; the depth and page limits bound the recursion
foreach (String link in temporalLinks)
{
scrapePage(link, domain);
}
}
catch (WebException)
{
//Ignore pages that fail to download and continue with the rest
}
}
}
//Get the same-domain links of an HTML page
private List<String> getPageAnchors(String htmlContent, String domain)
{
List<String> newLinks = new List<String>();
//Iterate over all the anchors of the page
MatchCollection anchorMatches = anchorRegex.Matches(htmlContent);
foreach (Match anchorMatch in anchorMatches)
{
//Check that the link belongs to the same domain
String urlDomain = WhoisServers.cleanDomain(anchorMatch.Value);
if (urlDomain == domain)
{
//Skip links that have already been collected
if (!links.Contains(anchorMatch.Value))
{
//Check that the URL is at or below the configured depth to scrape;
//TryCreate also discards relative or malformed URLs instead of throwing
Uri uri;
if (Uri.TryCreate(anchorMatch.Value, UriKind.Absolute, out uri) && uri.Segments.Length <= levels)
{
links.Add(anchorMatch.Value);
newLinks.Add(anchorMatch.Value);
}
}
}
}
return newLinks.Distinct().ToList();
}
//Get the emails of an HTML page
private void getPageEmails(String htmlContent, String domain)
{
//Find all the emails in the HTML content
MatchCollection emailMatches = emailRegex.Matches(htmlContent);
//Save the new emails on the list
foreach (Match emailMatch in emailMatches)
{
//Skip the email if it is already on the list
if (!emails.Contains(emailMatch.Value))
{
emails.Add(emailMatch.Value);
}
}
}
//Check whether the URL points to a file rather than a page
private bool isAFile(String url)
{
return fileRegex.IsMatch(url);
}
}
}