forked from fhamborg/news-please
-
Notifications
You must be signed in to change notification settings - Fork 0
/
commoncrawl.py
190 lines (165 loc) · 8.99 KB
/
commoncrawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python
"""
This script downloads WARC files from commoncrawl.org's news crawl and extracts articles from these files. You can
define filter criteria that need to be met (see YOUR CONFIG section), otherwise an article is discarded. Currently, the
script stores the extracted articles in JSON files, but this behaviour can be adapted to your needs in the method
on_valid_article_extracted. To speed up the crawling and extraction process, the script supports multiprocessing. You can
control the number of processes with the parameter my_number_of_extraction_processes.
You can also crawl and extract articles programmatically, i.e., from within
your own code, by using the class CommonCrawlCrawler or the function
commoncrawl_crawler.crawl_from_commoncrawl(...) provided in
newsplease.crawler.commoncrawl_crawler.py. In this case there is also the
possibility of passing in your own subclass of CommonCrawlExtractor as
extractor_cls=... . One use case here is that your subclass can customise
filtering by overriding `.filter_record(...)`.
In case the script crashes and its log output states at the beginning that only 1 file on AWS storage
was found, make sure that awscli was correctly installed. You can check that by running aws --version from a terminal.
If aws is not installed, you can (on Ubuntu) also install it using sudo apt-get install awscli.
This script uses relative imports to ensure that the latest, local version of news-please is used, instead of the one
that might have been installed with pip. Hence, you must run this script following this workflow:
git clone https://github.com/fhamborg/news-please.git
cd news-please
python3 -m newsplease.examples.commoncrawl
Note that by default the script does not extract main images since they are not contained
in WARC files. You can enable extraction of main images by setting `my_fetch_images=True`.
"""
import hashlib
import json
import logging
import os
import sys
import datetime
from datetime import date
from ..crawler import commoncrawl_crawler as commoncrawl_crawler
__author__ = "Felix Hamborg"
__copyright__ = "Copyright 2017"
__credits__ = ["Sebastian Nagel"]
############ YOUR CONFIG ############
# Directory the downloaded WARC files are stored in (can be overridden by the 1st command line argument).
my_local_download_dir_warc = './cc_download_warc/'
# Directory the extracted articles are written to (can be overridden by the 2nd command line argument).
my_local_download_dir_article = './cc_download_articles/'
# Host filter: only articles from these hosts are kept (if None or an empty list, any host is OK).
my_filter_valid_hosts = [] # example: ['elrancaguino.cl']
# Start date of the article date filter, as datetime (if None, any date is OK as start date).
my_filter_start_date = None # datetime.datetime(2016, 1, 1)
# End date of the article date filter, as datetime (if None, any date is OK as end date).
my_filter_end_date = None # datetime.datetime(2016, 12, 31)
# Only .warc files published within [my_warc_files_start_date, my_warc_files_end_date) will be downloaded.
# Note that the date a warc file has been published does not imply that it contains only news
# articles from that date. Instead, you must assume that the warc file can contain articles
# from ANY time before the warc file was published, e.g., a warc file published in August 2020
# may contain news articles from December 2016.
my_warc_files_start_date = None # example: datetime.datetime(2020, 3, 1)
my_warc_files_end_date = None # example: datetime.datetime(2020, 3, 2)
# If date filtering is strict and news-please could not detect the date of an article, the article will be discarded.
my_filter_strict_date = True
# If True, the script checks whether a file has been downloaded already and uses that file instead of downloading
# it again. Note that there is no check whether the file has been downloaded completely or is valid!
my_reuse_previously_downloaded_files = True
# If True, continue after an error occurred.
my_continue_after_error = True
# If True, show the progress of downloading the WARC files.
my_show_download_progress = False
# Log level; used for logging.basicConfig below and also handed to the crawler.
my_log_level = logging.INFO
# JSON export style used when an article is written to disk; only 0 and 1 are supported.
my_json_export_style = 1 # 0 (minimize), 1 (pretty)
# Number of extraction processes (can be overridden by the 4th command line argument).
my_number_of_extraction_processes = 1
# If True, the WARC file will be deleted after all articles have been extracted from it
# (can be overridden by the 3rd command line argument).
my_delete_warc_after_extraction = True
# If True, will first continue extraction from the latest fully downloaded but not fully extracted WARC files and
# then crawl new WARC files. This assumes that the filter criteria have not been changed since the previous run!
my_continue_process = True
# If True, will crawl and extract the main image of each article. Note that the WARC files
# do not contain any images, so news-please will crawl the current image from
# the article's online webpage if this option is enabled.
my_fetch_images = False
############ END YOUR CONFIG #########
# Logging: configure the root logger with the level chosen above and create this module's logger.
logging.basicConfig(level=my_log_level)
__logger = logging.getLogger(__name__)
def __setup__():
    """
    Prepare the local environment before crawling starts.

    Creates the article output directory (my_local_download_dir_article) including missing parent directories.
    A directory that already exists is left untouched.
    :return: None
    """
    article_dir = my_local_download_dir_article
    os.makedirs(article_dir, exist_ok=True)
def __get_pretty_filepath(path, article):
    """
    Build the path of the JSON file an article is stored in and make sure its directory exists.

    The file name is the SHA-256 hex digest of article.filename, which avoids overly long file names. Files are
    grouped in one sub directory of path per source domain (article.source_domain); the sub directory is created
    if it does not exist yet.
    :param path: base directory for all articles
    :param article: object providing the attributes filename (str) and source_domain
    :return: path of the form <path>/<source_domain>/<sha256 hex digest>.json
    """
    # Hash first: a problem with the file name surfaces before any directory is created.
    digest = hashlib.sha256(article.filename.encode()).hexdigest()
    domain_dir = os.path.join(path, article.source_domain)
    os.makedirs(domain_dir, exist_ok=True)
    return os.path.join(domain_dir, '{}.json'.format(digest))
def on_valid_article_extracted(article):
    """
    Store an extracted article as a JSON file below my_local_download_dir_article.

    This function will be invoked for each article that was extracted successfully from the archived data and that
    satisfies the filter criteria. Adapt it to do whatever you need to do with the article (e.g., save it to disk,
    store it in ElasticSearch, etc.).
    :param article: the extracted article; its attribute dictionary (article.__dict__) is serialised, and values
                    that json cannot encode natively are converted with str()
    :return: None
    :raises ValueError: if my_json_export_style is neither 0 (minimize) nor 1 (pretty)
    """
    # json.dump options per export style: 0 = minimized, 1 = pretty printed with sorted keys.
    export_options = {
        0: {'separators': (',', ':')},
        1: {'indent': 4, 'sort_keys': True},
    }
    # Validate before opening the file: opening it in 'w' mode and then writing nothing would silently leave an
    # empty (and thus invalid) JSON file behind.
    if my_json_export_style not in export_options:
        raise ValueError('unsupported my_json_export_style: ' + repr(my_json_export_style))

    with open(__get_pretty_filepath(my_local_download_dir_article, article), 'w', encoding='utf-8') as outfile:
        json.dump(article.__dict__, outfile, default=str, ensure_ascii=False,
                  **export_options[my_json_export_style])
def callback_on_warc_completed(warc_path, counter_article_passed, counter_article_discarded,
                               counter_article_error, counter_article_total, counter_warc_processed):
    """
    Hook that will be invoked for each WARC file that was processed completely.

    The parameters represent total values, i.e., accumulated over all previously processed WARC files. This
    default implementation intentionally does nothing; adapt it to your needs (e.g., to report progress).
    :param warc_path: the WARC file that has just been completed
    :param counter_article_passed: counter of passed articles
    :param counter_article_discarded: counter of discarded articles
    :param counter_article_error: counter of articles that caused an error
    :param counter_article_total: counter of all articles
    :param counter_warc_processed: counter of processed WARC files
    :return: None
    """
    return None
def main():
    """
    Command line entry point: apply optional command line overrides, print the effective settings, create the
    article directory and start crawling from commoncrawl.org.

    Optional positional command line arguments override the corresponding values of the YOUR CONFIG section:
      1. download directory for WARC files
      2. download directory for extracted articles
      3. "delete" to delete each WARC file after extraction; any other value keeps the WARC files
      4. number of extraction processes (integer)
    :return: None
    :raises ValueError: if the 4th command line argument is not an integer
    """
    global my_local_download_dir_warc
    global my_local_download_dir_article
    global my_delete_warc_after_extraction
    global my_number_of_extraction_processes

    if len(sys.argv) >= 2:
        my_local_download_dir_warc = sys.argv[1]
    if len(sys.argv) >= 3:
        my_local_download_dir_article = sys.argv[2]
    if len(sys.argv) >= 4:
        my_delete_warc_after_extraction = sys.argv[3] == "delete"
    if len(sys.argv) >= 5:
        my_number_of_extraction_processes = int(sys.argv[4])

    print("my_local_download_dir_warc=" + my_local_download_dir_warc)
    print("my_local_download_dir_article=" + my_local_download_dir_article)
    print("my_delete_warc_after_extraction=" + str(my_delete_warc_after_extraction))
    print("my_number_of_extraction_processes=" + str(my_number_of_extraction_processes))

    # Create the article directory only after the overrides have been applied.
    __setup__()
    commoncrawl_crawler.crawl_from_commoncrawl(on_valid_article_extracted,
                                               callback_on_warc_completed=callback_on_warc_completed,
                                               valid_hosts=my_filter_valid_hosts,
                                               start_date=my_filter_start_date,
                                               end_date=my_filter_end_date,
                                               warc_files_start_date=my_warc_files_start_date,
                                               warc_files_end_date=my_warc_files_end_date,
                                               strict_date=my_filter_strict_date,
                                               reuse_previously_downloaded_files=my_reuse_previously_downloaded_files,
                                               local_download_dir_warc=my_local_download_dir_warc,
                                               continue_after_error=my_continue_after_error,
                                               show_download_progress=my_show_download_progress,
                                               number_of_extraction_processes=my_number_of_extraction_processes,
                                               log_level=my_log_level,
                                               delete_warc_after_extraction=my_delete_warc_after_extraction,
                                               # Honour the config value instead of a hard-coded True, so that
                                               # setting my_continue_process = False actually takes effect.
                                               continue_process=my_continue_process,
                                               fetch_images=my_fetch_images)


if __name__ == "__main__":
    main()