gradcafeutility.py

'''
This program scrapes the GradCafe website using the user-provided options and,
if any new results are found, emails the user.
'''
import sys
import functions as func
from urllib import parse
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import os
import traceback
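
# third-party dependencies: requests, beautifulsoup4, and lxml (the parser
# handed to BeautifulSoup below); functions is this project's companion
# helper module (func.get_data, func.setup_email, func.log_error)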

def main():
    # get the previously saved data and raise an error if nothing was saved
    variables = func.get_data(path)
    if variables is None or len(variables) < 1:
        raise ValueError("Answers not saved.")
    # format the search string for the fields and schools
    school_string = get_string(variables["SCHOOLS"])
    field_string = get_string(variables["FIELDS"])
    # set up the request URL and make sure it is URL encoded
    # t=t tells the website to look only at results from the last 2 days
    base_url = 'https://www.thegradcafe.com/survey/index.php?t=t&q='
    search_query = school_string + ' ' + field_string
    request_url = base_url + parse.quote_plus(search_query)
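    # for illustration, with hypothetical saved settings SCHOOLS=["MIT", "Stanford"]
    # and FIELDS=["Computer Science"], the encoded URL would be:
    # https://www.thegradcafe.com/survey/index.php?t=t&q=%28MIT%7CStanford%29+%28%22Computer+Science%22%29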
    # most searches are unlikely to produce more than one page of results in
    # only 2 days, but this loop checks for additional pages anyway: it keeps
    # requesting and processing the next page of results until a page comes
    # back less than full
    yesterdays_results = []
    page = 1
    while True:
        new_url = request_url + '&p=' + str(page)
        # get all the results from the current page
        all_results = get_results(new_url)
        # add any results from yesterday to the list collecting the results
        # from all the pages
        for r in all_results["yesterday"]:
            yesterdays_results.append(r)
        # GradCafe shows up to 25 results per page, so a shorter page means
        # this was the last one
        if len(all_results["all"]) < 25:
            break
        # otherwise there may be more results, so move on to the next page
        page += 1
    # if any results from yesterday were found, send the email
    if len(yesterdays_results) > 0:
        # create the text version of the email
        text = "Check the website for more info\n\n" + request_url
        # create the HTML version of the email: a table with all the results
        html = '<html><head></head><body>'
        html += '<p><a href="' + request_url + '">Check the website for more info</a></p>'
        html += '<table>'
        for fld in yesterdays_results:
            html += '<tr>'
            for f in fld:
                html += str(f)
            html += '</tr>'
        html += '</table></body></html>'
        # create the email server connection and send the email
        valid_connection = func.setup_email(variables["EMAIL"])
        if valid_connection["valid"]:
            server = valid_connection["server"]
            # create the email header
            msg = MIMEMultipart('alternative')
            msg['Subject'] = 'New GradCafe Admissions Results'
            msg['From'] = valid_connection["username"]
            msg['To'] = variables["EMAIL"]
            # set the text/plain and text/html parts of the email and attach them to the message
            part1 = MIMEText(text, 'plain')
            part2 = MIMEText(html, 'html')
            msg.attach(part1)
            msg.attach(part2)
            server.sendmail(valid_connection["username"], variables["EMAIL"], msg.as_string())
            server.quit()
        else:
            func.log_error(path, "Utility", valid_connection["message"], valid_connection["trace"])
    else:
        # while not an error, this simply keeps a record of every execution of the program
        func.log_error(path, "Utility", "No new results posted.", '')

# this converts the field and school lists to the OR syntax recognized by GradCafe
def get_string(data):
    if len(data) > 0:
        # work on a copy so the caller's saved lists aren't modified in place
        terms = list(data)
        for i, s in enumerate(terms):
            # input with spaces needs quotes around it so the entire phrase
            # is searched as one term
            if len(s.split(" ")) > 1:
                terms[i] = '"' + s + '"'
        string = '(' + '|'.join(terms) + ')'
    else:
        # if no fields or schools are specified, don't add anything to the query
        string = ''
    return string
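# for illustration (hypothetical inputs):
#   get_string(["MIT", "Stanford University"]) returns '(MIT|"Stanford University")'
#   get_string([]) returns ''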

# this submits the GET request to the URL and handles the results;
# finding all the results and parsing out yesterday's are split into separate
# functions so each can also be called on its own inside the while loop
def get_results(request_url):
    r = requests.get(request_url)
    r.raise_for_status()
    response = {}
    response["all"] = find_results(r.text)
    response["yesterday"] = parse_results(response["all"])
    return response
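# for illustration, the returned dict has the shape
#   {"all": [<tr>, <tr>, ...], "yesterday": [[<td>, <td>, ...], ...]}
# where "all" holds every table row on the page and "yesterday" holds the
# cells of just the rows posted yesterday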

def find_results(text):
    # parse the HTML and find the table with the results
    soup = BeautifulSoup(text, "lxml")
    result_table = soup.find("table", {"class": "submission-table"})
    # raise an error if the main table isn't found or if there are no rows in
    # the table; otherwise return all the rows of the table
    if result_table is None:
        # surface the page's own "no results" message if the submissions
        # section is present, and a generic one if it isn't
        no_result = soup.find("section", {"class": "submissions"})
        if no_result is None:
            raise ValueError("No results found.")
        raise ValueError(no_result.get_text().strip())
    results = result_table.find_all("tr")
    if len(results) < 1:
        raise ValueError("No results found.")
    return results

# this iterates through the results found above to find any posted yesterday
def parse_results(results):
    new_results = []
    for res in results:
        # find all the columns in each row and iterate over them
        fields = res.find_all("td")
        if len(fields) > 0:
            for f in fields:
                # if the column contains the date the result was posted,
                # extract the date and check whether it is yesterday's date;
                # if it is, add the whole row to the list
                # (cells without a class attribute return None from get(),
                # so fall back to an empty list)
                if "datecol" in (f.get("class") or []):
                    date = datetime.strptime(f.get_text().strip(), '%d %b %Y').date()
                    if date == yesterday:
                        new_results.append(fields)
    return new_results
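# for illustration, the date cell is parsed with the '%d %b %Y' format, so
#   datetime.strptime("14 Mar 2019", '%d %b %Y').date() -> date(2019, 3, 14)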

if __name__ == "__main__":
    # wrap the program in a try/except block and log any errors
    try:
        # scheduled tasks run in C:\Windows\System32, so they don't have
        # access to the original files; the file path is therefore passed as
        # an argument to the task and extracted here so the saved data can be
        # found; otherwise log an error by creating a log file in the user's
        # Documents folder
        if len(sys.argv) > 1:
            path = sys.argv[1]
        else:
            path = os.path.join(os.path.expanduser("~"), "Documents")
            raise ValueError("No file path argument given.")
        # calculate yesterday's date once here so that every page of results
        # is compared against the same date
        yesterday = datetime.today().date() - timedelta(days=1)
        main()
    except Exception as e:
        func.log_error(path, "Utility", e, traceback.format_exc())