scraping.py
#!/usr/bin/env python
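# Scrape post text, usernames and timestamps from a set of mothering.com
# forums (behind a vBulletin login) in parallel and write them to CSV.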
import urllib2
import mechanize
import cookielib
import re
from bs4 import BeautifulSoup
import time
from joblib import Parallel, delayed
import multiprocessing
import sys
import csv
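
# Log in to the mothering.com forum with a mechanize browser so member-only
# threads are readable; fill in real credentials for 'yourusername' and
# 'yourpasswd' below before running.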
def doLogin():
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
#br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
    # The site we will navigate into, handling its session
br.open('http://www.mothering.com/forum/login.php?do=login')
# View available forms
#for f in br.forms():
# print f
# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)
# User credentials
br.form['vb_login_username'] = 'yourusername'
br.form['vb_login_password'] = 'yourpasswd'
# Login
br.submit()
return br
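
# Fetch one paginated forum index page (indexN.html) and return the thread
# URLs it links to.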
def doPage(main_url,ipage):
br = doLogin()
page = br.open(main_url+'/index'+str(ipage)+'.html').read()
    soup = BeautifulSoup(page, "html.parser")
    # thread links look like <main_url>/<thread-id>-<slug>.html
    links = soup.findAll(href=re.compile(re.escape(main_url) + r"/\d+(-.*)+\.html$"))
return map(lambda x: x['href'],links)
#
#Get texts, usernames and timestamps
#
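#Each thread page yields post bodies (post_message_* divs), timestamps
#("dateCreated" markers) and poster usernames (bigusername anchors),
#zipped into (username, timestamp, text) tuples.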
def doTexts(l):
br = doLogin()
    post_lines = []
    post = ""
    try:
        # fetch the thread page once; keep both the raw HTML and its lines
        post = br.open(l).read()
        post_lines = post.splitlines(True)
    except Exception:
        print "HTTP 404 probably...", l
    soup = BeautifulSoup(post, "html.parser")
tech_divs1 = soup.find_all('div', attrs={'id': re.compile('post_message_\d*')})
    tech_divs1 = filter(lambda x: x is not None, tech_divs1)
texts = map(lambda x: x.get_text().strip().encode("utf-8").translate(None,'\t\r\n'),tech_divs1)
    # capture the timestamp that follows each dateCreated"> marker in the raw HTML
    dates2 = re.findall(r'dateCreated">(\d+-\d+-\d+, \d+:\d+ [AP]M)', post)
    # usernames sit in <a ... rel="nofollow" class="bigusername">Name</a> anchors
    usernames = filter(lambda x: "nofollow\" class=\"bigusername" in x, post_lines)
    usernames2 = map(lambda x: x.split(">")[-2], usernames)
    # drop the exact "</a" suffix; rstrip("</a") would also strip trailing a's from names
    usernames3 = map(lambda x: x[:-3] if x.endswith("</a") else x, usernames2)
if len(texts) == len(dates2):
return zip(usernames3,dates2,texts)
else: return [("","","")]
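
# Walk each forum's index pages in parallel, collect the thread links,
# scrape every thread, and write one pipe-delimited CSV per forum.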
def main():
#configurable parameters
num_cores = 30 #multiprocessing.cpu_count()
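    # each entry is (forum URL, number of paginated index pages to walk)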
main_urls = [("http://www.mothering.com/forum/306-unassisted-childbirth",331),
#("http://www.mothering.com/forum/69-vaccinations-archives",1),
("http://www.mothering.com/forum/443-i-m-not-vaccinating",191),
("http://www.mothering.com/forum/373-selective-delayed-vaccination",114),
("http://www.mothering.com/forum/17507-vaccinating-schedule",7)
]
for main_url,nsubpages in main_urls:
forum_label = main_url.split("/")[-1]
start = time.time()
print "Running on ", num_cores, " CPU cores"
print "Scraping ",forum_label
real_links = Parallel(n_jobs=num_cores)(delayed(doPage)(main_url,ipage) for ipage in range(nsubpages))
        # index pages list some threads more than once, so deduplicate with set()
real_links = set([item for sublist in real_links for item in sublist])
end = time.time()
print "Elapsed time %s" % (end-start)
#print real_links
results = Parallel(n_jobs=num_cores)(delayed(doTexts)(l) for l in real_links)
results = [item for sublist in results for item in sublist]
#save the data
with open(forum_label+'_out.csv','w') as out:
csv_out=csv.writer(out,delimiter='|')
csv_out.writerow(['username','timestamp','text'])
for row in results:
csv_out.writerow(row)
end2 = time.time()
print "Total elapsed time %s" % (end2-start)
#Validate
#for l in results:
# print l,"\n"
if __name__=='__main__':
main()
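
# Usage sketch (assumes a Python 2 environment with mechanize, beautifulsoup4
# and joblib installed, and real forum credentials filled into doLogin above):
#   pip install mechanize beautifulsoup4 joblib
#   python scraping.py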