-
Notifications
You must be signed in to change notification settings - Fork 72
/
crawler.py
executable file
·181 lines (168 loc) · 7.34 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
import argparse
import errno
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
def log_if_v(msg):
    """Print msg to stderr with a '[DEBUG] ' prefix when verbose mode is on.

    Reads the module-level ``args`` namespace and does nothing unless
    ``args.verbose`` is truthy.
    """
    if not args.verbose:
        return
    print('[DEBUG] %s' % msg, file=sys.stderr)
def parse_quora_date(origin, quora_str):
    """Convert a Quora "Added ..." label into a 'YYYY-MM-DD' string.

    Quora's short date strings don't provide enough information to determine
    the exact time, unless it was within the last day, so only the calendar
    date is returned.

    The text following "Added " may take any of these forms:
      * 'just now', '<N>m ago', '<N>h ago', '<N>am' or '<N>pm'
      * a weekday abbreviation, 'Mon' .. 'Sun'
      * '<Mon> <day>', e.g. 'Jan 31'
      * '<Mon> <day>, <year>', e.g. 'Jan 31, 2014'

    Args:
        origin: reference time in seconds since the epoch, already offset by
            the time zone; relative dates are counted back from it.
        quora_str: string from Quora, e.g. "Added Jan 31".

    Returns:
        The date as a string such as '2015-01-31'.

    Raises:
        ValueError: if quora_str contains no "Added <date>" part, or the date
            is invalid or in an unrecognized form.
    """
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months_of_year = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    seconds_per_day = 86400

    _, _, date_str = quora_str.partition('Added ')
    date_str = date_str.strip()
    if date_str == '':
        raise ValueError('"%s" does not appear to indicate when answer was added' % quora_str)

    # Raw strings keep '\d' a regex escape rather than an (invalid) string escape.
    month_alternatives = '|'.join(months_of_year)
    just_now = re.match(r'just now$', date_str)
    clock_time = re.match(r'(\d+)[ap]m$', date_str)
    minutes_ago = re.match(r'(\d+)m ago$', date_str)
    hours_ago = re.match(r'(\d+)h ago$', date_str)
    weekday = re.match(r'(' + '|'.join(days_of_week) + r')$', date_str)
    month_day = re.match(r'(' + month_alternatives + r') (\d+)$', date_str)
    month_day_year = re.match(r'(' + month_alternatives + r') (\d+), (\d+)$', date_str)

    if just_now is not None or clock_time is not None:
        # Using origin for time in am / pm since the time of the day will be
        # discarded anyway
        tm = time.gmtime(origin)
    elif minutes_ago is not None:
        tm = time.gmtime(origin - 60 * int(minutes_ago.group(1)))
    elif hours_ago is not None:
        tm = time.gmtime(origin - 3600 * int(hours_ago.group(1)))
    elif weekday is not None:
        # Walk backward, starting from yesterday, until we reach the given
        # day of the week (tm_wday counts Monday as 0, like days_of_week).
        wanted_wday = days_of_week.index(weekday.group(1))
        for offset in range(1, 8):
            tm = time.gmtime(origin - seconds_per_day * offset)
            if tm.tm_wday == wanted_wday:
                break
        else:
            raise ValueError('date "%s" is invalid' % date_str)
    elif month_day is not None:
        # Walk backward, starting from yesterday, until we reach the given
        # month and day; this is how the missing year is recovered.
        wanted_month = months_of_year.index(month_day.group(1)) + 1
        wanted_day = int(month_day.group(2))
        for offset in range(1, 367):
            tm = time.gmtime(origin - seconds_per_day * offset)
            if tm.tm_mon == wanted_month and tm.tm_mday == wanted_day:
                break
        else:
            # e.g. 'Feb 30', which no day in the past 366 days can match
            raise ValueError('date "%s" is invalid' % date_str)
    elif month_day_year is not None:
        # may raise ValueError
        tm = time.strptime(date_str, '%b %d, %Y')
    else:
        raise ValueError('date "%s" could not be interpreted' % date_str)
    return '%d-%02d-%02d' % (tm.tm_year, tm.tm_mon, tm.tm_mday)
# Command-line interface. The parsed options are kept in the module-level
# `args`, which log_if_v() reads to decide whether to emit debug output.
parser = argparse.ArgumentParser(
    description='Download a set of answers from Quora',
)
parser.add_argument(
    'input_file',
    help='file containing JSON-encoded list of timestamped URLs to download',
)
parser.add_argument(
    'output_dir',
    nargs='?',
    default='./quora-answers',
    help='where to store the downloaded answers and images',
)
parser.add_argument(
    '-d', '--delay',
    default=0,
    type=float,
    help='Time to sleep between answers, in seconds',
)
parser.add_argument(
    '-t', '--origin_timestamp',
    default=None,
    type=int,
    help='JS time when the list of URLs was fetched',
)
parser.add_argument(
    '-z', '--origin_timezone',
    default=None,
    type=int,
    help='browser timezone',
)
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='enable debug messages',
)
parser.add_argument(
    '-o', '--overwrite',
    action='store_true',
    help='Overwrite existing answers',
)
args = parser.parse_args()
# Determine the origin for relative date computation: the moment the list of
# URLs was fetched, in seconds since the epoch, shifted by the time zone
# offset so that time.gmtime(origin) yields the local calendar date.
if args.origin_timestamp is None:
    log_if_v('Using current time')
    args.origin_timestamp = time.time()
else:
    # The option is a JS time, i.e. milliseconds; convert to seconds.
    args.origin_timestamp //= 1000
if args.origin_timezone is None:
    log_if_v('Using system time zone')
    # time.timezone is the non-DST offset only, which is an hour off while
    # DST is in effect. Use the offset that applies at the origin instant.
    # tm_gmtoff is seconds east of UTC; this script counts seconds west.
    args.origin_timezone = -time.localtime(args.origin_timestamp).tm_gmtoff
else:
    # Presumably minutes west of UTC, as JavaScript's
    # Date.getTimezoneOffset() reports -- confirm against the URL collector.
    args.origin_timezone *= 60
origin = args.origin_timestamp - args.origin_timezone
# Load the list of answer URLs from the input file. The expected content is a
# JSON list of two-element lists of strings: [url, "Added ..." date label].
log_if_v('Loading input file %s' % args.input_file)
with open(args.input_file, 'rb') as input_file:
    try:
        answers = json.load(input_file)
    except ValueError:
        # Covers malformed JSON and undecodable bytes; report it like any
        # other bad input instead of dumping a traceback.
        sys.exit('[FATAL] Incorrect input format')

# Check the validity of the input before using it in any way
if not isinstance(answers, list):
    sys.exit('[FATAL] Incorrect input format')
for entry in answers:
    if not (isinstance(entry, list) and len(entry) == 2
            and isinstance(entry[0], str) and isinstance(entry[1], str)):
        sys.exit('[FATAL] Incorrect input format')

# Only report the count once we know `answers` really is a list of entries
print('Found %d answers' % len(answers), file=sys.stderr)
# Create the output directory (private to the current user) if needed, then
# make it the working directory so every answer is saved relative to it.
log_if_v('Creating directory %s' % args.output_dir)
try:
    os.mkdir(args.output_dir, 0o700)
except FileExistsError:
    log_if_v('Directory already exists')
# Any other OSError is left to propagate: this is the top level, and we have
# nothing else to do if we failed.
os.chdir(args.output_dir)
# Download every answer into the current working directory as
# '<date> <question>.html'. download_file_count counts only the answers that
# were actually fetched AND saved.
download_file_count = 0
for url, quora_date in answers:
    sys.stderr.flush()
    print('URL: %s' % url, file=sys.stderr)

    # Determine the date when this answer was written
    try:
        added_time = parse_quora_date(origin, quora_date)
    except ValueError as error:
        print('[WARNING] Failed to parse date: %s' % str(error), file=sys.stderr)
        added_time = 'xxxx-xx-xx'
    print('Date: %s' % added_time, file=sys.stderr)

    # Get the part of the URL indicating the question title; we will save
    # under this name. The second pattern is tried only if the first fails
    # and covers URLs with a context topic in front of the question.
    question = (re.search(r'quora\.com/([^/]+)/answer', url)
                or re.search(r'quora\.com/[^/]+/([^/]+)/answer', url))
    if question is None:
        print('[ERROR] Could not find question part of URL %s; skipping' % url, file=sys.stderr)
        continue
    filename = added_time + ' ' + question.group(1)

    # Trim the filename if it's too long. 255 bytes is the limit on many
    # filesystems, and the '.html' suffix has to fit within it as well.
    suffix = '.html'
    max_stem_bytes = 255 - len(suffix)
    filename_bytes = filename.encode('utf-8')
    if len(filename_bytes) > max_stem_bytes:
        # Cutting on a byte boundary can split a multi-byte character;
        # errors='ignore' drops that incomplete trailing sequence.
        filename = filename_bytes[:max_stem_bytes].decode('utf-8', errors='ignore')
        log_if_v('Filename was truncated to at most 255 bytes.')
    filename += suffix
    log_if_v('Filename: %s' % filename)

    # Nothing to do unless overwrite is enabled or the answer doesn't exist
    if not args.overwrite and os.path.isfile(filename):
        log_if_v('Answer File : %s Already Exists. Skipping' % filename)
        continue

    # Fetch the URL to find the answer
    log_if_v('Downloading answer from URL %s' % url)
    try:
        # The context manager closes the HTTP response once it has been read
        with urllib.request.urlopen(url) as response:
            page_html = response.read()
    except OSError as error:
        # URLError (including HTTPError) carries .reason; lower-level socket
        # errors raised while connecting or reading do not.
        reason = getattr(error, 'reason', error)
        print('[ERROR] Failed to download answer from URL %s (%s)' % (url, reason), file=sys.stderr)
        continue

    try:
        with open(filename, 'wb') as f:
            f.write(page_html)
    except IOError as error:
        print('[ERROR] Failed to save answer to file %s (%s)' % (filename, error.strerror), file=sys.stderr)
    else:
        # Count the answer only when it really made it to disk
        download_file_count += 1
    # A request was made either way, so pause before the next one
    time.sleep(args.delay)

print('Done. Downloaded %d files' % download_file_count, file=sys.stderr)