-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtranscriptscraper.py
41 lines (34 loc) · 963 Bytes
/
transcriptscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import sys
import urllib2
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* and return the raw response body.

    This is a best-effort helper: a failure is reported on stdout and
    signalled to the caller by returning ``None`` instead of raising.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        URL is malformed or the request fails.
    """
    opener = urllib2.build_opener()
    try:
        response = opener.open(urllib2.Request(url))
        try:
            return response.read()
        finally:
            # Always release the underlying socket, even if read() fails.
            response.close()
    except urllib2.URLError as e:
        print('Error: ' + str(e.reason))
    except ValueError as e:
        # Raised for malformed input such as an empty string ("unknown url
        # type"); report it like any other fetch failure so a single bad
        # link does not abort the caller's whole run.
        print('Error: ' + str(e))
    return None
def _extract_transcript(html):
    """Return the transcript text found in the HTML of one talk page.

    Every ``span.talk-transcript__para__text`` element yields one paragraph:
    the text of each ``span.talk-transcript__fragment`` inside it followed by
    a space, with the paragraph terminated by a blank line.
    """
    soup = BeautifulSoup(html, "html.parser")
    pieces = []
    for para in soup.find_all("span", class_="talk-transcript__para__text"):
        fragments = para.find_all("span", class_="talk-transcript__fragment")
        for fragment in fragments:
            # get_text() instead of .string: .string is None whenever a
            # fragment contains nested markup, which would make the
            # concatenation below raise TypeError.
            pieces.append(fragment.get_text() + " ")
        pieces.append("\n\n")
    return "".join(pieces)


def main():
    """Download every page listed in links.txt and save its transcript.

    Reads one URL per line from ``links.txt``, prints how many URLs were
    found, fetches each one with ``get_html`` and appends the extracted
    transcript to ``transcripts.txt``. Links that cannot be fetched are
    skipped.
    """
    # Read in all the links. Blank lines (e.g. the empty entry produced by a
    # trailing newline) are dropped because they are not fetchable URLs.
    with open("links.txt", "r") as link_file:
        candidates = link_file.read().split('\n')
    links = [entry.strip() for entry in candidates if entry.strip()]
    print(len(links))

    # Open transcripts.txt for appending; the with-block guarantees the file
    # is flushed and closed even if a page fails to parse.
    with open('transcripts.txt', 'a') as out:
        for link in links:
            html = get_html(link)
            if not html:
                continue
            print('Parsing ' + link)
            transcript = _extract_transcript(html)
            # Non-ASCII characters are deliberately dropped so the output
            # file stays plain ASCII.
            out.write(transcript.encode('ascii', 'ignore'))
            print("success \n\n")
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    main()