corpustools.py
# -*- coding: utf-8 -*-
from __future__ import print_function

try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib import urlopen

from crawlertools import CrawlerState
from filters import filtertools
from parsertools import HTMLParser

class WebCorpusBuilder(object):
"""Crawls HTML pages on the web and scrapes text data.
The ``WebCorpusBuilder'' crawls the internet recursively, using a set of simple filters to define its behavior. The documentation of the filters is below:
URL BASED FILTERS =====================================================
urlfilter Filter a URL (default: _ => True)
DOCUMENT BASED FILTERS ================================================
tagfilter Filter lowercased tag names in the document (e.g. body).
(default: content tags only see: ``filtertools.tagfilter'')
probefilter Filter entire pages before recursively scraping links.
This is usually important to avoid fast growth of Web
Pages in the queue. You might think of something
Creative to put here but usually less is more.
If you are collecting web pages in a specific language
This could be a good place to apply the white-list.
(default: _ => True)
datafilter Filter each element of the ``splitfilter''.
Your implementation should return a truth-value:
``True'' if the data should be accepted ;
``False'' otherwise. (default _ => True)
TEXT PROCESSORS I/O ===================================================
splitter Split entire pages and return a list of matches.
One could use ``re.findall'' to split results ;
Or, one could split the string on sentences using
The NLTK module (e.g.: ``nltk.tokenize.sent_tokenize'')
(default: _ => [_])
writer Write data out to a file or data structure in
Any way that you wish. This filter may return
A ``None'' type since its return value is never
Used. By default, the write filter prints the
Scrape output to stdout. (default _ => print(_))
TRAVERSAL FUNCTIONS ===================================================
visitor Visitor triggered when visiting a webpage.
Function takes the URL of the page as argument.
Return value is ignored (default: _ => None).
"""
    def __init__(self):
        """Creates a new Web Corpus Builder."""
        self.urlfilter = lambda p: True
        self.tagfilter = filtertools.tagfilter
        self.probefilter = lambda t: True
        self.datafilter = lambda d: True
        self.visitor = lambda p: None
        self.splitter = lambda p: [p]
        self.writer = lambda d: print(d)
        self.parser = HTMLParser(self)
        self.crawler = CrawlerState()
    def feed(self, pages):
        """Feeds a set of pages to the crawler and crawls them.

        :param pages: a page URL (str) or an iterable of page URLs
        :return: None
        """
        # A single URL is wrapped in a list; any other iterable is crawled
        # page by page.
        if not hasattr(pages, '__iter__') or isinstance(pages, (str, bytes)):
            pages = [pages]
        for page in pages:
            self.crawler.setpage(page)
            self.crawl(page)
    def crawl(self, page):
        """Initiates a single crawl of the web.

        The URL is first matched against the preliminary filters: the URL
        itself is checked and, if it passes, the contents of the page are
        read, the visitor is called, and the probe filter is run. If all
        of these pass, the text data is finally fed to the parser.

        :param page: (str) a page URL to crawl
        """
        try:
            # Match the URL itself before fetching anything.
            if not self.urlfilter(page):
                return
            res = urlopen(page)
            data = res.read().decode("utf-8")
            self.visitor(page)
            if self.probefilter(data):
                self.parser.feed(data)
                for split in self.splitter(self.parser.resdata):
                    if self.datafilter(split):
                        self.writer(split)
        except KeyboardInterrupt:
            import sys
            sys.exit(0)
        except UnicodeDecodeError as e:
            print('=== %s at %s' % (e, page))
        except UnicodeError as e:
            print('=== %s at %s' % (e, page))
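
# The sketch below shows how the filters documented in the class docstring
# could be configured. It is only an illustrative example: the host name,
# keyword and length threshold are assumptions, not values used anywhere in
# this module.
#
#   builder = WebCorpusBuilder()
#   # Only follow URLs on a single (hypothetical) host.
#   builder.urlfilter = lambda url: url.startswith('https://example.org/')
#   # Skip pages that never mention a keyword before scraping their links.
#   builder.probefilter = lambda html: 'corpus' in html.lower()
#   # Keep only fragments with at least five whitespace-separated tokens.
#   builder.datafilter = lambda text: len(text.split()) >= 5
#   builder.feed('https://example.org/')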
# if __name__ == '__main__':
#     import re
#     import nltk
#
#     sents = []
#
#     def writer(x):
#         global sents
#         x = x.replace('\n', ' ')
#         x = re.sub(r'(\[.*\])|(\(.*\))', ' ', x)
#         x = nltk.tokenize.word_tokenize(x)
#         sents.append(x)
#
#     wcb = WebCorpusBuilder()
#     wcb.splitter = nltk.tokenize.sent_tokenize
#     wcb.writer = writer
#     wcb.feed('https://en.wikipedia.org/wiki/Cats')
#
#     for x in wcb.crawler.pages:
#         print(x)
#
#     # for x in sents:
#     #     print(x)
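
# The writer can also collect data rather than print it; per the class
# docstring it may write "to a file or data structure in any way that you
# wish". A minimal sketch, assuming an output path of 'corpus.txt' (an
# arbitrary choice, not used elsewhere in this module):
#
#   import io
#
#   builder = WebCorpusBuilder()
#
#   def file_writer(fragment):
#       # Append each accepted fragment to the corpus file on its own line.
#       with io.open('corpus.txt', 'a', encoding='utf-8') as handle:
#           handle.write(fragment.strip() + u'\n')
#
#   builder.writer = file_writer
#   builder.feed('https://en.wikipedia.org/wiki/Cats')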