-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathcoursera_offline.py
executable file
·415 lines (360 loc) · 13.8 KB
/
coursera_offline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
#!/usr/bin/python
import argparse
import json
import sys
import urllib2
import cookielib
import random
import urllib
import threading
import os
from pyquery import PyQuery as pq
from crontab import CronTab
from urlparse import urlparse
# --- Coursera endpoints ---
AUTH_URL = 'https://accounts.coursera.org/api/v1/login'
# %s is filled with the course "shortname" (see create_class_url)
CLASS_VIDEO_URL_TEMPLATE = 'https://class.coursera.org/%s/lecture'
BASE_URL = 'https://www.coursera.org'
# Domain/path used when planting locally generated CSRF cookies (get_cookie)
_204_DOMAIN = '.coursera.org'
_204_PATH = '/'
# Seconds; not referenced anywhere else in this file
TIMEOUT = 300
# Spoof a desktop Chromium browser; the X-CSRF* headers are added in login()
DEFAULT_HEADERS = {
    'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36',
    'Accept' : '*/*',
    'Accept-Encoding' : 'gzip,deflate,sdch',
    'Accept-Language' : 'en-US,en;q=0.8',
    'Connection' : 'keep-alive',
    'Referer': 'https://accounts.coursera.org/signin',
    'Origin': 'https://accounts.coursera.org',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded'
}
# On-disk layout: per-week folders with videos, a Subs/ subfolder for .srt
# files and an "Other Files" subfolder for slides/spreadsheets/PDFs.
SUB_DIR = 'Subs'
VID_EXT = '.mp4'
SUB_EXT = '.srt'
DATA_FILE = 'data.json'
COOKIE_FILE = 'cookie.cookies'
# Root for all downloads; may be overridden by -d via set_course_dir()
COURSE_DIR = os.getcwd()
OTHER_DIR = 'Other Files'
SUPPORTED_OTHER_FILE_EXTENSIONS = ['.xlsx', '.pptx', '.pdf']
class Downloader(threading.Thread):
    """Worker thread that downloads one URL to a local file.

    url      -- fully qualified download link
    savepath -- destination path, relative to COURSE_DIR
    cookie   -- cookie jar holding the authenticated Coursera session
    is_sub   -- True for subtitle (text) files, written in text mode
                instead of binary mode
    """

    def __init__(self, url, savepath, cookie, is_sub=False):
        threading.Thread.__init__(self)
        self.url = url
        self.savepath = savepath
        self.cookie = cookie
        self.is_sub = is_sub

    def run(self):
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie))
        print('Downloading to %s' % absolute_path(self.savepath))
        mode = 'w' if self.is_sub else 'wb'
        try:
            req = urllib2.Request(self.url)
            response = opener.open(req)
            # 'with' guarantees the handle is closed even if the read
            # or write raises (the original could leak it).
            with open(absolute_path(self.savepath), mode) as f:
                f.write(response.read())
        except Exception as e:
            # Bug fix: the original swallowed the error silently and then
            # printed "Download finished" even for failed downloads.
            print('Download failed for %s: %s' % (self.url, e))
            # Remove the partial file so the next synch retries it.
            if path_exists(self.savepath):
                os.remove(absolute_path(self.savepath))
        else:
            print('Download finished for %s' % absolute_path(self.savepath))
def get_vid_sub_links(anchor_elems):
    """Split a lecture's anchor tags into (video link, srt link, other files).

    Returns a 3-tuple: the .mp4 download href (or None), the srt subtitle
    href (or None), and a list of hrefs for supported extra files.
    """
    vid_link, sub_link, other_links = None, None, []
    for elem in anchor_elems:
        href = pq(elem).attr('href')
        if 'subtitles' in href and 'format=srt' in href:
            sub_link = href
        elif 'download.mp4' in href:
            vid_link = href
        elif any(ext in href for ext in SUPPORTED_OTHER_FILE_EXTENSIONS):
            other_links.append(href)
    return vid_link, sub_link, other_links
def exit_with_message(msg):
    """Print msg and abort the script with a non-zero (failure) status.

    Used exclusively on error paths throughout this script.
    """
    print(msg)
    # Bug fix: sys.exit() with no argument exits with status 0, which made
    # every failure look like success to cron / calling shells.
    sys.exit(1)
def has_cookiefile():
    """Return True when a saved session cookie file exists in COURSE_DIR."""
    # path_exists() already resolves its argument against COURSE_DIR; the
    # original applied absolute_path() first, resolving the path twice
    # (harmless only because os.path.join drops the left part when the
    # right part is absolute).
    return path_exists(COOKIE_FILE)
def parse_arguments():
    """Parse command-line options and return the argparse namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--shortname", help="""Short name of the course being downloaded.
This option is required when running the script for the first time""")
    parser.add_argument("-e", "--email", help="Email id registered with Coursera")
    parser.add_argument("-p", "--password", help="Coursera Password")
    parser.add_argument("-S", "--synch", help="Give this flag to synch with Coursera", action='store_true')
    parser.add_argument("-d", "--dir", help="Give this option to save the videos in the path specified as argument.\
Defaults to Present Working Directory (PWD).")
    # -a takes an optional day-of-week argument: a bare "-a" yields 'SUN'
    # (const), omitting -a entirely yields None (default) so no cron job
    # is scheduled.
    parser.add_argument("-a", "--auto",
        help="Give this option to create a crontab entry inorder to automatically synch with Coursera.\
Argument must one among 'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'. The argument is optional and\
defaults to 'SUN'",
        nargs='?',
        const='SUN',
        default=None)
    args = parser.parse_args()
    return args
def validate_arguments(args):
    """Abort the script unless the parsed arguments form a usable combination."""
    if not args:
        exit_with_message('')
    # Credentials may be omitted only when a saved session cookie exists.
    missing_credentials = not (args.email and args.password)
    if missing_credentials and not has_cookiefile():
        exit_with_message('Please provide both email and password')
    # Either a course shortname (first run) or the synch flag is required.
    if not (args.synch or args.shortname):
        exit_with_message('One of the arguments -s or -S must be given')
def create_class_url(classname):
    """Return the lecture-page URL for the given course short name.

    Aborts the script when classname is empty or None.
    """
    if classname is None or classname == '':
        exit_with_message('Invalid class name')
    return CLASS_VIDEO_URL_TEMPLATE % classname
def absolute_path(rel_path):
    # Resolve rel_path against the course download directory (COURSE_DIR).
    return os.path.join(COURSE_DIR, rel_path)
def parse_data_file():
    """Load and validate the saved course JSON (DATA_FILE).

    Exits the script when the file is missing, unreadable, or lacks the
    required 'cname'/'data' keys. Returns the parsed dict otherwise.
    """
    if not path_exists(absolute_path(DATA_FILE)):
        exit_with_message('Data file does not exist')
    try:
        # 'with' closes the handle even when json.load raises (the
        # original leaked the file object in that case).
        with open(absolute_path(DATA_FILE)) as f:
            parsed_json = json.load(f)
    except Exception as e:
        exit_with_message(e)
    # dict.has_key() is deprecated; 'in' is the idiomatic membership test.
    if 'cname' not in parsed_json or 'data' not in parsed_json:
        exit_with_message('Invalid json file')
    return parsed_json
def login(email, password):
    """Authenticate against Coursera and return a cookie jar with CAUTH set.

    Reuses a previously saved cookie file when it still holds a valid
    session; otherwise spoofs the browser login flow: plant two locally
    generated CSRF cookies, mirror them in the request headers, and POST
    the credentials. Exits the script on any failure.
    """
    cookie_jar = cookielib.LWPCookieJar(absolute_path(COOKIE_FILE))
    if has_cookiefile():
        cookie_jar.load(ignore_discard=True)
    # A saved CAUTH cookie means we are already logged in — skip the POST.
    if isLoggedIn(cookie_jar):
        return cookie_jar
    elif not email or not password:
        exit_with_message('Provide email and password')
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    # The CSRF2 cookie *name* itself carries a random suffix; the server
    # matches cookie values against the X-CSRF* headers set below.
    csrf2_token = 'csrf2_token_' + csrfMake(8)
    csrf2_token_value = csrfMake()
    csrf_token = csrfMake()
    csrf2 = get_cookie(csrf2_token, csrf2_token_value)
    csrf = get_cookie('csrftoken', csrf_token)
    cookie_jar.set_cookie(csrf)
    cookie_jar.set_cookie(csrf2)
    # Mirror the cookie values in the headers (mutates the module-level dict).
    DEFAULT_HEADERS['X-CSRFToken'] = csrf_token
    DEFAULT_HEADERS['X-CSRF2-Token'] = csrf2_token_value
    DEFAULT_HEADERS['X-CSRF2-Cookie'] = csrf2_token
    data = {
        'email' : email,
        'password' : password,
        'webrequest' : 'true'
    }
    try:
        login_res = opener.open(urllib2.Request(AUTH_URL, urllib.urlencode(data), DEFAULT_HEADERS))
    except Exception, e:
        exit_with_message(e)
    # Success is detected by the CAUTH cookie appearing in the jar, not by
    # the HTTP response body.
    if not isLoggedIn(cookie_jar):
        exit_with_message('Login Failed. Try again later')
    # Persist the session so later runs can skip the login entirely.
    cookie_jar.save(ignore_discard=True)
    return cookie_jar
def normalize_title(title):
    """Make a title safe for use as a filename.

    Replaces the characters ':', '/', ',' with '-' (same mapping as the
    original chained .replace() calls).
    """
    for ch in (':', '/', ','):
        title = title.replace(ch, '-')
    return title
def download(parsed_json, cookie):
    """Queue and run one Downloader thread per missing video/subtitle/file.

    parsed_json -- course dict: {'cname': ..., 'data': [per-week dicts]}
    cookie      -- authenticated cookie jar handed to each worker

    Files that already exist (under either the current "<n>-<title>" naming
    or the legacy un-numbered naming) are skipped, so repeated synchs only
    fetch new content.
    """
    # Bug fix: the original tested `COURSE_DIR is not ''` — an identity
    # comparison that is True for any normal string; use != instead.
    if COURSE_DIR != '':
        create_folder('')
    threads = []
    print('Downloading videos')
    for week_count, sub_json in enumerate(parsed_json['data']):
        folder_name = str(week_count) + '-' + normalize_title(sub_json['title'])
        create_folder(folder_name)
        create_folder(os.path.join(folder_name, SUB_DIR))
        create_folder(os.path.join(folder_name, OTHER_DIR))
        for count, vid_info in enumerate(sub_json['links']):
            title = normalize_title(vid_info['title'])
            # Legacy paths (no numeric prefix) written by older versions;
            # still honoured so existing downloads are not re-fetched.
            old_vid_path = os.path.join(folder_name, title + VID_EXT)
            old_sub_path = os.path.join(folder_name, SUB_DIR, title + SUB_EXT)
            sub_path = os.path.join(folder_name, SUB_DIR, str(count) + '-' + title + SUB_EXT)
            vid_path = os.path.join(folder_name, str(count) + '-' + title + VID_EXT)
            if path_exists(vid_path) or path_exists(old_vid_path):
                print('Skipping %s' % vid_path)
            elif vid_info['link']:
                # Guard: a lecture without a video link would otherwise
                # spawn a thread that fails on urllib2.Request(None).
                threads.append(Downloader(vid_info['link'], vid_path, cookie))
            if path_exists(sub_path):
                print('Skipping %s' % sub_path)
            elif not vid_info['sub_link']:
                pass  # no subtitle published for this lecture
            elif path_exists(old_sub_path):
                # Refresh the subtitle in its legacy location.
                threads.append(Downloader(vid_info['sub_link'], old_sub_path, cookie, True))
            else:
                threads.append(Downloader(vid_info['sub_link'], sub_path, cookie, True))
            for other_link in vid_info['other_links']:
                u = urlparse(other_link)
                # Derive a filename from the last URL path segment.
                other_title = normalize_title(u.path.split('/')[-1])
                if not other_title:
                    continue
                other_path = os.path.join(folder_name, OTHER_DIR, str(count) + '-' + other_title)
                if path_exists(other_path):
                    print('Skipping %s' % other_path)
                else:
                    threads.append(Downloader(other_link, other_path, cookie))
    # Start every worker, then wait for all of them to finish.
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
def path_exists(path):
    # True when `path` (interpreted relative to COURSE_DIR) exists on disk.
    return os.path.exists(absolute_path(path))
def create_folder(foldername):
    """Create foldername under COURSE_DIR unless it already exists.

    Returns True when a directory was created, False when the path was
    already present.
    """
    already_there = path_exists(foldername)
    if not already_there:
        os.makedirs(absolute_path(foldername))
    return not already_there
def data_file_exists():
    # True when the saved course data file is present in COURSE_DIR.
    return path_exists(DATA_FILE)
def set_course_dir(dir):
    """Point the module-level COURSE_DIR at dir, creating it if absent.

    A falsy dir is a no-op, leaving COURSE_DIR unchanged.
    """
    if not dir:
        return
    global COURSE_DIR
    target = os.path.abspath(dir)
    if not os.path.exists(target):
        os.makedirs(target)
    COURSE_DIR = target
def schedule_synch(day, email, password):
    """Register a weekly cron job that re-runs this script with -S.

    day      -- three-letter weekday ('MON'..'SUN', case-insensitive)
    email    -- Coursera login email, embedded in the cron command
    password -- Coursera password, embedded in the cron command

    NOTE(review): the password is stored in plain text in the user's
    crontab — callers should be aware of this before using -a.
    """
    if not day:
        exit_with_message('Failed to schedule synch: Invalid day')
    if not email or not password:
        exit_with_message('Invalid username and password')
    day = day.upper()
    if day not in ('MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'):
        exit_with_message('Failed to schedule synch: Invalid day')
    import pipes  # local import: only needed when scheduling
    # Bug fix: quote each value so a course directory, email or password
    # containing spaces or shell metacharacters does not break (or inject
    # into) the generated crontab command line.
    cmd = "%s -d %s -S -e %s -p %s" % (
        pipes.quote(os.path.abspath(__file__)),
        pipes.quote(COURSE_DIR),
        pipes.quote(email),
        pipes.quote(password))
    user_cron = CronTab(user=True)
    job = user_cron.new(command=cmd)
    # Run weekly at 11:59 on the requested day.
    job.hour.on(11)
    job.minute.on(59)
    job.dow.on(day)
    user_cron.write()
    print('Cron Job added')
def process_course_dir(args):
    """Apply the -d option: validate and select the download directory.

    When synching, the directory must already exist (it holds the saved
    data file); in both cases COURSE_DIR is repointed at it.
    """
    if not args.dir:
        return
    if args.synch and not course_dir_exists(args.dir):
        exit_with_message('Directory %s does not exist' % (args.dir))
    # Bug fix: the original never called set_course_dir() on the synch
    # path, so `-S -d <dir>` validated <dir> but then read the data file
    # from (and downloaded into) the current working directory instead.
    set_course_dir(args.dir)
def process_arguments(args):
    """Return (course shortname, parsed data-file JSON or None).

    First run (-s given): the shortname comes from the command line and
    there is no data file yet. Synch run: both come from the data file.
    """
    process_course_dir(args)
    if args.shortname:
        return args.shortname, None
    p = parse_data_file()
    # Bug fix: the data file stores the short name under 'cname' (written
    # by get_course_info and validated by parse_data_file); the original
    # read p['shortname'], which raised KeyError on every synch run.
    return p['cname'], p
def course_dir_exists(dir):
    """Return True when dir (relative or absolute) exists on disk."""
    # Resolve to an absolute path first so relative arguments work too.
    resolved = os.path.abspath(dir)
    return os.path.exists(resolved)
def main():
    """Entry point: parse args, log in, fetch/refresh course data, download."""
    args = parse_arguments()
    validate_arguments(args)
    shortname, parsed_json = process_arguments(args)
    print('Logging in')
    cookie_logged_in = login(args.email, args.password)
    # NOTE(review): when a data file already exists (synch runs),
    # parsed_json is not None, so the course page is never re-fetched and
    # newly published lectures are not picked up — confirm this is intended.
    if parsed_json is None:
        parsed_json = get_course_info(shortname, cookie_logged_in)
        save_data_file(parsed_json)
    download(parsed_json, cookie_logged_in)
    # Optionally install the weekly cron job (-a).
    if args.auto:
        schedule_synch(args.auto, args.email, args.password)
def save_data_file(parsed_json):
    """Serialize the course JSON to DATA_FILE; exit the script on failure."""
    if not parsed_json:
        exit_with_message('Invalid data to save')
    try:
        # 'with' guarantees the file is closed even when the dump raises
        # (the original leaked the handle on a write error).
        with open(absolute_path(DATA_FILE), 'w') as f:
            json.dump(parsed_json, f)
    except Exception as e:
        exit_with_message('Failed to save the JSON file')
def get_cookie(name, value):
    """Build a synthetic session cookie scoped to .coursera.org.

    Used by login() to plant locally generated CSRF tokens in the jar so
    the login request echoes them back to the server.
    """
    return cookielib.Cookie(version=0,
                            name=name,
                            value=value,
                            domain=_204_DOMAIN,
                            domain_specified=False,
                            domain_initial_dot=False,
                            path=_204_PATH,
                            path_specified=False,
                            secure=False,
                            expires=None,  # session cookie: no expiry
                            comment=None,
                            comment_url=None,
                            rest={'HttpOnly':None},
                            rfc2109=False,
                            discard=False,
                            port=None,
                            port_specified=False
                            )
def get_course_info(shortname, cookie):
    """Scrape the course's lecture page and build the course data dict.

    shortname -- course short name used to build the class URL
    cookie    -- authenticated cookie jar for the HTTP requests

    Returns {'cname': shortname, 'data': [...]} where each entry of 'data'
    is the per-week dict produced by parse_week_info. Exits the script if
    fetching or parsing fails.
    """
    print 'Getting course information %s' % shortname
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    print 'Fetching course videos page'
    try:
        req = urllib2.Request(create_class_url(shortname))
        res = opener.open(req)
        response_html = res.read()
    except Exception, e:
        exit_with_message('Failed to fetch the course information')
    try:
        doc = pq(response_html)
    except Exception, e:
        exit_with_message('Failed to parse the html file')
    course_info_json = {'cname': shortname, 'data':[]}
    html_headers = doc('.course-item-list-header')
    try:
        # Each week header is paired by index with its lecture list; the
        # .each callback fills course_info_json in place.
        html_headers.each(lambda x,y: parse_week_info(x,y,
            doc('.course-item-list-section-list').eq(x), course_info_json))
    except Exception, e:
        exit_with_message('Invalid HTML file receieved')
    return course_info_json
def parse_week_info(i, e, sl, j):
    """Parse one week's header and lecture list into j['data'].

    i  -- week index (unused; required by pyquery's .each callback shape)
    e  -- the week header element (.course-item-list-header)
    sl -- pyquery wrapper around the week's lecture <li> list
    j  -- the course info dict being built (mutated in place)
    """
    header = pq(e)
    week_title = header('h3').text()
    week_json = {'title': week_title, 'links': []}
    for li in sl('li'):
        anchors = pq(li)('a')
        if not anchors:
            # Robustness: an entry with no anchors at all would have
            # raised IndexError below (aborting the whole parse).
            continue
        # Bug fix: the original unpacked into 'sl', shadowing the
        # section-list parameter with the subtitle link mid-loop.
        vid_link, sub_link, other_links = get_vid_sub_links(anchors)
        week_json['links'].append({
            'title': pq(anchors[0]).text(),
            'link': vid_link,
            'sub_link': sub_link,
            'other_links': other_links
        })
    j['data'].append(week_json)
def isLoggedIn(cookie):
    """Return True when the jar contains Coursera's 'CAUTH' auth cookie.

    cookie may be None, an empty jar, or any iterable of cookie objects;
    returns False for the first two.
    """
    if not cookie:
        return False
    # Idiom: any() replaces the hand-rolled enumerate loop, which also
    # shadowed the 'cookie' parameter with each element.
    return any(c.name == 'CAUTH' for c in cookie)
def csrfMake(length=24,
             chars='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """Return a random token of `length` characters drawn from `chars`.

    Used to fabricate the csrftoken/csrf2 values for the login handshake.
    NOTE(review): uses the non-cryptographic `random` module — adequate
    for spoofing the login form, not for real security tokens.
    """
    # Idiom: random.choice replaces the hand-rolled
    # chars[int(random.random() * len(chars))] indexing.
    return ''.join(random.choice(chars) for _ in range(length))
if __name__ == "__main__":
    # Run only when executed as a script, not on import.
    main()