# NOTE: removed GitHub page chrome and the copied line-number gutter (1-209)
# that were accidentally captured when this file was scraped from the web UI.
#!/usr/bin/env python3
# Scrape a single Survey Monkey survey that you own with a free account.
# This script alleviates the need to sign up for Survey Monkey's premium plans
# if you just want to export the survey responses in a tabular format
#
# This requires that:
# you own the account: You have full access to the account
# the account is free: I did not test this with an upgraded account
# you wish to scrape just one survey: Designed with only this in mind
#
# cacampbell 2/9/17
from lxml import etree
import requests
import logging
class MonkeyScraperException(Exception):
    """Scraper-specific failure carrying a mapping of error details.

    *message* becomes the standard exception text; *errors* is stored on
    the instance for callers that want structured detail.
    """

    def __init__(self, message, errors):
        super().__init__(message)
        # Keep the detail payload available alongside the message.
        self.errors = errors
class MonkeyScraper:
    """Scrape a single SurveyMonkey survey that you own with a free account.

    Intended for use as a context manager::

        with MonkeyScraper(username='u', password='p') as scraper:
            scraper.scrape(survey_url)

    All navigation happens through one ``requests`` session per instance,
    using browser-like headers so the site does not reject the client.
    """

    # Default credentials; instance values set via __init__ shadow these.
    username = ''
    password = ''

    # Site endpoints used for navigation.
    _monkey = 'https://www.surveymonkey.com'
    _home_url = _monkey + '/home/'
    _login_url = _monkey + '/user/sign-in/'
    _logout_url = _monkey + '/user/sign-out/'

    # XPath for the log-out link on the home page (presence == logged in)
    # and for response items on an analyze page.
    _logout_button_selector = '//*[@id="dd-my-account"]/ul/li[5]/a'
    _responses_selector = "//*[contains(@class, 'ta-response-item')]"

    # Browser-like headers; sent with every request.
    _headers = {
        'User-Agent': 'Mozilla/5.0',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'www.surveymonkey.com',
        'Origin': _monkey,
        'Referer': _monkey
    }

    def __init__(self, *args, **kwargs):
        """Accept credentials positionally (username, password) or by keyword.

        Keyword values take precedence over positional ones when both are
        supplied.
        """
        # BUG FIX: the session was previously a class attribute, so every
        # MonkeyScraper instance shared one connection pool and cookie jar.
        # Each instance now owns its session.
        self._session = requests.session()
        if args:
            self.username = args[0]
            self.password = args[1]
        if 'username' in kwargs:
            self.username = kwargs['username']
        if 'password' in kwargs:
            self.password = kwargs['password']

    def __enter__(self):
        """Prime session cookies, log in, and return self."""
        self.init()
        self.log_in()
        return self

    def __exit__(self, ex_type, ex_val, traceback):
        """Log any traceback, then always log out and close the session."""
        if traceback:
            logging.log(level=logging.CRITICAL, msg=traceback)
        self.log_out()
        self.close()

    def _get_cookies(self):
        """Return the subset of session cookies needed to navigate the site.

        Raises:
            MonkeyScraperException: if the session cookie jar is empty,
                i.e. ``init()`` was never called.
        """
        # Cookies observed to be required to avoid 403 (Forbidden) errors.
        _cookie_keys = ['SSLB', 'apex__sm', 'auth', 'session', 'sm_rec',
                        'ep201', 'ep202', 'tld_user', 'tld_set',
                        'endpages_seen', 'SSRT', 'ucs_topbar_views']
        if not self._session.cookies:
            raise MonkeyScraperException(
                "Premature Cookie Robbery",
                {"No Cookies": "The session CookieJar is empty"}
            )
        cookies = {}
        for key in _cookie_keys:
            try:
                cookies[key] = self._session.cookies[key]
            except KeyError:
                # Cookies the server never set are simply skipped.
                pass
        return cookies

    def _get(self, url='', headers=None, **kwargs):
        """GET *url* through the session; extra kwargs become request data.

        NOTE(review): splatted cookies end up in the request *body*, not a
        Cookie header — kept as-is, since the session's own cookie jar also
        carries them; confirm before changing.
        """
        return self._session.get(url=url, headers=headers, data=kwargs)

    def _post(self, url='', headers=None, **kwargs):
        """POST to *url* through the session; extra kwargs become form data."""
        return self._session.post(url=url, headers=headers, data=kwargs)

    def _check_code(self, resp):
        """Raise MonkeyScraperException unless *resp* has HTTP status 200."""
        code = resp.status_code
        if code != 200:
            # BUG FIX: the error payload was a set literal ({a, b}); it is
            # now the key->message dict that MonkeyScraperException expects.
            raise MonkeyScraperException(
                "Not Okay (200)",
                {'{}'.format(code): 'Server sent {}. Expected 200.'.format(code)}
            )

    def _page_root(self, response):
        """Parse *response* content into an lxml HTML element tree root."""
        return etree.HTML(response.content)

    def _logged_in(self):
        """Return True iff the home page shows a log-out button."""
        resp = self._get(url=self._home_url,
                         headers=self._headers,
                         **self._get_cookies())
        self._check_code(resp)
        logout_button = etree.XPath(self._logout_button_selector)
        try:
            logout_button(self._page_root(resp))[0]  # exists?
        except IndexError:
            return False
        return True

    def check_logged_in(self):
        """Raise MonkeyScraperException if the session is not logged in."""
        if not self._logged_in():
            # BUG FIX: error payload was a set literal; now a proper dict.
            raise MonkeyScraperException(
                "Logged Out",
                {"Logged Out": "The session was unexpectedly logged out"}
            )

    def init(self):
        """Fetch the sign-in page so the server seeds the cookie jar.

        The log-in URL is used because it is needed anyway and harmlessly
        redirects to the home page when already logged in.
        """
        self._get(url=self._login_url, headers=self._headers)

    def log_in(self, username='', password=''):
        """Submit the sign-in form, falling back to stored credentials.

        Raises:
            MonkeyScraperException: when no credentials are available, the
                server responds with a non-200 status, or the session is
                still logged out afterwards.
        """
        if self._logged_in():
            logging.log(level=logging.WARNING, msg='Already logged in')
            return
        username = username or self.username
        password = password or self.password
        if not (username and password):
            raise MonkeyScraperException(
                "LogIn Failed",
                {'Incorrect Arguments': 'Required: username and password'}
            )
        login_headers = dict(self._headers)
        login_headers['Referer'] = self._login_url
        resp = self._post(
            url=self._login_url,
            headers=login_headers,
            username=username,
            password=password,
            **self._get_cookies()
        )
        self._check_code(resp)
        self.check_logged_in()

    def _get_analyze_page_root(self, survey_url):
        """Fetch *survey_url* (must be logged in) and return its parsed root."""
        self.check_logged_in()
        headers = dict(self._headers)
        headers['Referer'] = survey_url
        survey_page = self._get(survey_url,
                                headers=headers,
                                **self._get_cookies())
        self._check_code(survey_page)
        return self._page_root(survey_page)

    def scrape(self, survey_url):
        """Fetch the analyze page for *survey_url* (extraction is TODO)."""
        page = self._get_analyze_page_root(survey_url)
        print(page)

    def create_db(self, survey_url):
        """Fetch the analyze page for *survey_url* (DB export is TODO)."""
        page = self._get_analyze_page_root(survey_url)
        print(page)

    def log_out(self):
        """Hit the sign-out URL if currently logged in; warn otherwise."""
        logout_headers = dict(self._headers)
        logout_headers['Referer'] = self._home_url
        if self._logged_in():
            resp = self._get(
                url=self._logout_url,
                headers=logout_headers,
                **self._get_cookies()
            )
            self._check_code(resp)
        else:
            logging.log(level=logging.WARNING, msg='Already logged out')

    def close(self):
        """Release the underlying HTTP session and its connections."""
        self._session.close()