-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy path: run_scrapers.rb
133 lines (123 loc) · 3.75 KB
/
run_scrapers.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Entry-point script: runs every enabled TeSS scraper and emails a failure report.
require_relative 'lib/tess_scrapers'
require 'net/smtp'
require 'yaml' # YAML is used below to read email_config.yml; require it explicitly rather than relying on a transitive require.
require 'pathname'
require 'fileutils'

log = 'log/scrapers.log' # The log file for this script, just mentions which scrapers are run.
output = 'log/scrapers.out' # The log file for scraper output. Says how many events etc. were scraped. Need to logrotate this, will be big!

# Send the failure email unless the first CLI argument is 'no_email'.
# (The previous `rescue true` modifier was a no-op: this comparison cannot
# raise, and `nil != 'no_email'` already evaluates to true.)
email = ARGV[0] != 'no_email'
# Scraper classes to run, in this order. Commented-out entries are scrapers
# that are currently disabled — broken or changed sources, superseded
# versions, or known issues (see the inline notes where given). Keep the
# disabled entries: they document the history of each source.
scrapers = [
BabrahamScraper,
#BiocompRdfaScraper,
BioconductorScraper,
#BioconductorJsonldScraper,
#BioschemasScraper,
#BitsvibEventsJsonldScraper,
#BitsvibRdfaScraper,
BiviMaterialScraper,
BiviEventScraper,
#BmtcJsonldScraper,
CambridgeEventsScraper,
CourseraScraper,
#CscEventsScraper,
CscEventsScraperNew,
CvlEventbriteScraper,
#DataCarpentryScraper,
#DataCarpentryEventsScraper,
#DenbiScraper,
DtlsEventsScraper,
#EbiScraper, # Broken old materials one
EbiJsonScraper,
EdinburghScraper,
ElixirEventsScraper,
EnanomapperScraper,
#ErasysRdfaScraper, # Domain changed to erasysapp.eu, breaking old links
#FlemishJsonldEventsScraper,
#FuturelearnRdfaScraper,
#GalaxyEventsScraper,
#GalaxyScraper,
#Genome3dScraper,
#GobletRdfaScraper,
#GobletApiScraper, # See ticket #20
#IfbRdfaScraper,
#IntermineScraper,
#KhanAcademyApiScraper,
LegacySoftwareCarpentryScraper,
#LibraryCarpentryEventsScraper,
LuxembourgRdfaScraper,
#NbisEventsScraper,
#NgsRegistryScraper,
#OpenTargetJsonScraper,
#PortugalEventsScraper,
#PraceEventsScraper,
#RssScraper,
SheffieldScraper,
#SibScraper,
#SibEventsScraper,
#SoftwareCarpentryEventsScraper,
#IannEventsScraper,
#ScilifelabScraper,
#WellcomeEventsScraper
]
# Options passed to every scraper. Swap in the "Testing" line below when
# running locally. # Live!
options = {
  output_file: output,
  debug: false,
  verbose: false,
  offline: false,
  cache: false
}
# options = { output_file: output, debug: true, verbose: true, offline: false, cache: true } # Testing

# Accumulates [scraper_class, exceptions] pairs for the failure report.
failed_scrapers = []
# Main run: open the script log, load the email configuration, run each
# scraper in turn (collecting its exceptions), then email a failure report
# if anything went wrong and emailing is enabled.
log_file = nil
begin
  # Open log file
  log_file = File.open(log, 'w')

  # Load email settings, bootstrapping email_config.yml from the example on
  # first run.
  begin
    dir = Pathname(File.dirname(__FILE__)).expand_path
    unless dir.join('email_config.yml').exist?
      FileUtils.cp(dir.join('email_config.example.yml'), dir.join('email_config.yml'))
    end
    # YAML.load_file replaces the previous YAML.load(open(...)), which leaked
    # the file handle opened via Kernel#open.
    email_config = YAML.load_file(dir.join('email_config.yml'))['email']
  rescue => e
    puts "Couldn't load email_config.yml:"
    raise e
  end

  scrapers.each do |scraper_class|
    log_file.puts "Running #{scraper_class}"
    exceptions = []
    begin
      scraper = scraper_class.new(options)
      scraper.run
      # Scrapers can also record non-fatal exceptions internally; report those too.
      exceptions = scraper.exceptions
    rescue => e
      exceptions << e
    end
    failed_scrapers << [scraper_class, exceptions] if exceptions.any?
    exceptions.each do |exception|
      log_file.puts exception.message
      log_file.puts exception.backtrace.join("\n")
      log_file.puts
    end
  end

  if email && failed_scrapers.any?
    # Build a raw RFC 822-style message: headers, blank line, then the body.
    message = ''
    message << "From: #{email_config['from']}\n"
    message << "To: #{email_config['to']}\n"
    message << "Sender: #{email_config['sender']}\n" if email_config.key?('sender')
    message << "Subject: Scraper Failure (#{failed_scrapers.map { |e| e[0] }.join(', ')})\n"
    message << "\n"
    message << "It would seem that the following scrapers have failed to run:\n\n"
    failed_scrapers.each do |scraper_class, exceptions|
      message << "#{scraper_class}:\n"
      exceptions.each do |e|
        message << " #{e.message}\n"
        e.backtrace.each do |t|
          message << " #{t}\n"
        end
      end
    end
    message << "\n"
    begin
      Net::SMTP.start(email_config['server']) do |smtp|
        smtp.send_message message, email_config['from'], email_config['to']
      end
    rescue => e
      # Best effort: a failed notification should not abort the run.
      puts "Could not email: #{message} | #{e}"
    end
  end

  log_file.puts 'Done'
ensure
  # Guard against File.open itself having failed: the previous bare
  # `log_file.close` raised NoMethodError on nil here and masked the
  # original error.
  log_file&.close
end