-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
94 lines (75 loc) · 2.81 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python3
import requests
import re
import json
import html
'''
Logs in to portal.fhict.nl
Tries to read submissions 2..251, doesn't give a fuck if one fails
Does some shady string operations and summons zalgo
Writes what everyone voted to results.json
'''
headers = {
'Host': 'portal.fhict.nl',
# Set a nice user agent to explain to friendly worried sysadmins why a single client did 252 requests in a few seconds
'User-Agent': 'bef0re\'s robot counting the votes on NEXT week subjects (GitHub version)',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Content-Length': '0',
}
s = requests.session()
s.headers.update(headers)
# Log in
data = {
"curl": "Z2F",
"flags": 0,
"forcedownlevel":0,
"formdir": 6,
"username": "", # Insert your portal.fhict.nl username here
"password": "", # Insert your portal.fhict.nl password here
"trusted":0,
"SubmitCreds":"Log+On",
}
print ("Logging in...")
r = s.post("https://portal.fhict.nl/CookieAuth.dll?Logon",data=data)
ballots = {}
for i in range(2,252):
print ("Requesting page "+str(i))
r = s.post('https://portal.fhict.nl/Studentenplein/Lists/NEXTSubjects/DispForm.aspx?ID='+str(i))
# Cut away everything before and after what we want
start = '<table class="ms-formtable" style="margin-top: 8px;" border="0" cellpadding="0" cellspacing="0" width="100%">'
end = '<tr id="idAttachmentsRow">'
text = r.text[r.text.find(start):r.text.find(end)]
# Get rid of newlines and tabs
text = text.replace("\r\n","")
text = text.replace("\t","")
# Decode htmlentities
text = html.unescape(text)
# Then pick out what we want and summon zalgo at the same time
groups = re.findall('-->(.*?)</td>', text)
'''
groups[0] workshops
groups[1][43:-12] workshops_textbox
groups[2] gastsprekers
groups[3][43:-12] gastsprekers_textbox
groups[4] entertainment
groups[5][43:-12] entertainment_textbox
[43:-12] cuts away some dirt, ehhh HTML.
'''
try:
ballots[i] = {}
ballots[i]['workshops'] = groups[0].split('; ')
ballots[i]['workshops_textbox'] = groups[1][43:-12]
ballots[i]['gastsprekers'] = groups[2].split('; ')
ballots[i]['gastsprekers_textbox'] = groups[3][43:-12]
ballots[i]['entertainment'] = groups[4].split('; ')
ballots[i]['entertainment_textbox'] = groups[5][43:-12]
except IndexError:
print ("Warning: page " + str(i) + " gave an IndexError")
# parsing didn't work.
# To prevent gastsprekers from leaking into entertainment, we need to cleanse this ballot
ballots[i] = {}
print ("Writing everything to results.json:")
with open("results.json",'w') as f:
print (json.dumps(ballots), file=f)