forked from coursera-dl/edx-dl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_parsing.py
154 lines (128 loc) · 6.96 KB
/
test_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import json
import pytest
from edx_dl.common import DEFAULT_FILE_FORMATS
from edx_dl.parsing import (
edx_json2srt,
ClassicEdXPageExtractor,
CurrentEdXPageExtractor,
is_youtube_url,
)
# Test conversion of JSON subtitles to srt
def test_empty_json_subtitle():
    """Check that parsing the ``empty.json`` fixture raises ``ValueError``.

    The contents of ``test/json/empty.json`` are read as text and handed to
    ``json.loads``; the test passes only if that call raises ``ValueError``.
    """
    with open('test/json/empty.json') as f:
        json_string = f.read()
    # Only the parsing call sits inside the ``raises`` block.  Its result is
    # deliberately not bound to a name: the call is expected to raise, so a
    # binding would never be used.
    with pytest.raises(ValueError):
        json.loads(json_string)
@pytest.mark.parametrize(
    'file,expected',
    [
        ('test/json/empty-text.json', ''),
        ('test/json/minimal.json', ''),
        (
            'test/json/abridged-01.json',
            (
                '0\n'
                '00:00:18,104 --> 00:00:20,428\n'
                'I am very glad to see everyone here,\n\n'
            ),
        ),
        (
            'test/json/abridged-02.json',
            (
                '0\n'
                '00:00:18,104 --> 00:00:20,428\n'
                'I am very glad to see everyone here,\n\n'
                '1\n'
                '00:00:20,569 --> 00:00:24,721\n'
                'so let\'s enjoy the beauty of combinatorics together.\n\n'
            ),
        ),
    ]
)
def test_subtitles_from_json(file, expected):
    """Convert a JSON subtitle fixture with ``edx_json2srt`` and compare.

    ``file`` is the path of a JSON fixture; ``expected`` is the exact text
    that ``edx_json2srt`` must return for the parsed contents of that file.
    """
    with open(file) as fixture:
        parsed_subtitles = json.load(fixture)

    srt_text = edx_json2srt(parsed_subtitles)

    assert srt_text == expected
# Test extraction of video/other assets from HTML
def test_extract_units_from_html_single_unit_multiple_subs():
    """Check the first video extracted from ``single_unit_multiple_subs.html``.

    The page is parsed with ``CurrentEdXPageExtractor`` and the first video of
    the first unit is checked for its YouTube URL, its first mp4 URL and its
    subtitle template URL.
    """
    site = 'https://courses.edx.org'
    with open("test/html/single_unit_multiple_subs.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(page, site, DEFAULT_FILE_FORMATS)

    first_video = units[0].videos[0]
    assert first_video.video_youtube_url == 'https://youtube.com/watch?v=b7xgknqkQk8'
    assert first_video.mp4_urls[0] == 'https://d2f1egay8yehza.cloudfront.net/edx-edx101/EDXSPCPJSP13-H010000_100.mp4'
    assert first_video.sub_template_url == 'https://courses.edx.org/courses/edX/DemoX.1/2014/xblock/i4x:;_;_edX;_DemoX.1;_video;_14459340170c476bb65f73a0a08a076f/handler/transcript/translation/%s'
def test_extract_multiple_units_multiple_resources():
    """Check the units, videos and resources found in ``multiple_units.html``.

    Expects three units; the first unit must expose a given YouTube URL among
    its videos, a given mp4 URL on its first video, and a given PDF among its
    resource URLs.
    """
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(page, site, DEFAULT_FILE_FORMATS)

    assert len(units) == 3

    first_unit = units[0]
    # this one has multiple speeds in the data-streams field
    youtube_urls = [video.video_youtube_url for video in first_unit.videos]
    assert 'https://youtube.com/watch?v=CJ482b9r_0g' in youtube_urls

    first_video_mp4_urls = first_unit.videos[0].mp4_urls
    assert len(first_video_mp4_urls) > 0
    assert 'https://s3.amazonaws.com/berkeley-cs184x/videos/overview-motivation.mp4' in first_video_mp4_urls

    assert 'https://courses.edx.org/static/content-berkeley-cs184x~2012_Fall/slides/overview.pdf' in first_unit.resources_urls
def test_extract_multiple_units_no_youtube_ids():
    """Check a page whose first video has no YouTube URL but has mp4 URLs.

    ``multiple_units_no_youtube_ids.html`` is parsed with
    ``ClassicEdXPageExtractor``; the first video of the first unit must have
    ``video_youtube_url`` set to ``None`` and a non-empty ``mp4_urls``.
    """
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_no_youtube_ids.html", "r") as html_file:
        page = html_file.read()

    extractor = ClassicEdXPageExtractor()
    units = extractor.extract_units_from_html(page, site, DEFAULT_FILE_FORMATS)

    first_video = units[0].videos[0]
    assert first_video.video_youtube_url is None
    assert len(first_video.mp4_urls) > 0
def test_extract_multiple_units_youtube_link():
    """Check that a YouTube link is listed among the first unit's resources.

    ``multiple_units_youtube_link.html`` is parsed with
    ``CurrentEdXPageExtractor`` and the expected URL is looked up in
    ``resources_urls`` of the first unit.
    """
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_youtube_link.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(page, site, DEFAULT_FILE_FORMATS)

    first_unit_resources = units[0].resources_urls
    assert 'https://www.youtube.com/watch?v=5OXQypOAbdI' in first_unit_resources
def test_extract_multiple_units_multiple_youtube_videos():
    """Check that the first unit of the fixture page yields three videos.

    ``multiple_units_multiple_youtube_videos.html`` is parsed with
    ``CurrentEdXPageExtractor``; the first unit must hold exactly three videos,
    one of which carries the expected YouTube URL.
    """
    site = 'https://courses.edx.org'
    with open("test/html/multiple_units_multiple_youtube_videos.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(page, site, DEFAULT_FILE_FORMATS)

    first_unit_videos = units[0].videos
    assert len(first_unit_videos) == 3

    youtube_urls = [video.video_youtube_url for video in first_unit_videos]
    assert 'https://youtube.com/watch?v=3atHHNa2UwI' in youtube_urls
@pytest.mark.parametrize(
    'file,num_sections_expected,num_subsections_expected',
    [
        ('test/html/new_sections_structure.html', 2, 12),
        ('test/html/empty_sections.html', 0, 0),
    ]
)
def test_extract_sections(file, num_sections_expected, num_subsections_expected):
    """Count the sections and subsections extracted from a fixture page.

    ``file`` is the path of an HTML fixture.  The number of sections returned
    by ``extract_sections_from_html`` and the total number of subsections
    across those sections must match the expected counts.
    """
    site = 'https://courses.edx.org'
    with open(file, "r") as html_file:
        page = html_file.read()

    sections = CurrentEdXPageExtractor().extract_sections_from_html(page, site)
    assert len(sections) == num_sections_expected

    # Total the subsections over every extracted section.
    subsection_total = 0
    for section in sections:
        subsection_total += len(section.subsections)
    assert subsection_total == num_subsections_expected
@pytest.mark.parametrize(
    'filename,site,num_courses_expected,num_available_courses_expected',
    [
        ('test/html/dashboard-version-with-articles.html', 'https://courses.edx.org', 18, 14),
        ('test/html/dashboard-version-with-divs.html', 'https://courses.edx.org', 18, 14),
    ]
)
def test_extract_courses_from_html(filename, site, num_courses_expected, num_available_courses_expected):
    """Count the courses extracted from a dashboard fixture page.

    Checks both the total number of courses returned by
    ``extract_courses_from_html`` and how many of them have ``state`` equal to
    ``'Started'``.
    """
    with open(filename, "r") as html_file:
        page = html_file.read()

    courses = CurrentEdXPageExtractor().extract_courses_from_html(page, site)
    assert len(courses) == num_courses_expected

    # A course counts as available here when its state is 'Started'.
    started_count = sum(1 for course in courses if course.state == 'Started')
    assert started_count == num_available_courses_expected
def test_is_youtube_url():
    """Check ``is_youtube_url`` against URLs it must reject and must accept."""
    # Inputs for which is_youtube_url must return a falsy value.
    rejected = (
        'http://www.google.com/',
        'TODO',
        'https://d2f1egay8yehza.cloudfront.net/mit-24118/MIT24118T314-V015000_DTH.mp4',
        'https://courses.edx.org/courses/course-v1:MITx+24.118x+2T2015/xblock/block-v1:MITx+24.118x+2T2015+type@video+block@b1588e7cccff4d448f4f9676c81184d9/handler/transcript/available_translations',
    )
    # Inputs for which is_youtube_url must return a truthy value.
    accepted = (
        'http://www.youtu.be/rjOpZ3i6pRo',
        'http://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'http://youtu.be/rjOpZ3i6pRo',
        'http://youtube.com/watch?v=rjOpZ3i6pRo',
        'https://www.youtu.be/rjOpZ3i6pRo',
        'https://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'https://youtu.be/rjOpZ3i6pRo',
        'https://youtube.com/watch?v=rjOpZ3i6pRo',
    )

    for candidate in rejected:
        assert not is_youtube_url(candidate)

    for candidate in accepted:
        assert is_youtube_url(candidate)