forked from jamezpolley/aec_postcode_electorate_data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
67 lines (56 loc) · 2.7 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
import requests
import os
from bs4 import BeautifulSoup
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///data.sqlite'
import scraperwiki
def scrape_postcode(postcode):
    """Scrape all AEC locality-search results for one postcode.

    Fetches the first results page with a GET, then walks any further pages
    by replaying the ASP.NET postback (``__EVENTTARGET``/``__VIEWSTATE``),
    saving each data row into the scraperwiki sqlite ``data`` table.

    Args:
        postcode: Postcode string to filter the AEC locality search by.

    Returns:
        None. Rows are persisted as a side effect via scraperwiki.
    """
    url = ("https://electorate.aec.gov.au/LocalitySearchResults.aspx"
           "?filter={}&filterby=Postcode").format(postcode)
    pageno = 1
    # timeout= stops the scraper hanging forever on a stalled connection.
    page = requests.get(url, timeout=60).content
    while True:
        bs4 = BeautifulSoup(page, "html.parser")
        table = bs4.find(id='ContentPlaceHolderBody_gridViewLocalities')
        # Occasionally (e.g. postcode "2250") a page will have exactly 20 items on it but no next page
        # So if this happens just bail out
        if table is None:
            return
        rows = table.find_all('tr', recursive=False)
        for row in rows:
            # Skip header and footer of table because they're making tables like it's the 2000s
            if row.attrs == {}:
                tds = row.find_all('td')
                if len(tds) == 6:
                    rowdata = {'state': tds[0].text, 'suburb': tds[1].text, 'postcode': tds[2].text, 'electorate': tds[3].text, 'redistributed': tds[4].text, 'other': tds[5].text}
                    scraperwiki.sqlite.save(unique_keys=('state', 'suburb', 'postcode', 'electorate'), data=rowdata, table_name='data')
                else:
                    # An attribute-less row without 6 cells is the "no results" placeholder.
                    print("WARNING: No results for postcode {}".format(postcode))
        # A full page is 20 data rows + header + footer = 22 <tr>s; anything
        # less means this was the last page.
        if len(rows) < 22:
            return
        pageno += 1
        # Get the next page using the ASP.net postback
        data = {
            "__EVENTTARGET": "ctl00$ContentPlaceHolderBody$gridViewLocalities",
            "__EVENTARGUMENT": "Page${}".format(pageno),
            "__VIEWSTATEENCRYPTED": "",
            "__VIEWSTATE": bs4.find(id="__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": bs4.find(id="__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": bs4.find(id="__EVENTVALIDATION")["value"],
        }
        page = requests.post(url, data=data, timeout=60).content
# Oh lordy. The AEC website appears to be blocking morph.io.
# Route all HTTP(S) traffic through our proxy when the
# MORPH_AUSTRALIAN_PROXY environment variable is set.
proxy = os.environ.get("MORPH_AUSTRALIAN_PROXY")
if proxy is not None:
    print("Using proxy setting in MORPH_AUSTRALIAN_PROXY...")
    # requests honours these standard variables for both schemes.
    for var in ("HTTP_PROXY", "HTTPS_PROXY"):
        os.environ[var] = proxy
# Read the list of postcodes to scrape (one per line) and scrape each,
# printing "<postcode>: <i>/<total>" progress as we go.
# `with` ensures the file handle is closed (the original leaked it).
with open("postcodes", "r") as f:
    postcodes = [line.strip() for line in f]
total_codes = len(postcodes)
for i, postcode in enumerate(postcodes, start=1):
    print("{}: {}/{}".format(postcode, i, total_codes))
    scrape_postcode(postcode)