-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
82 lines (63 loc) · 2.12 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
import ssl
import urllib.request
from retry import retry
from urllib.error import HTTPError
from dc_base_scrapers.ckan_scraper import CkanScraper
from dc_base_scrapers.geojson_scraper import (
GeoJsonScraper,
RandomIdGeoJSONScraper
)
@retry(HTTPError, tries=2, delay=30)
def get_data_from_url(url):
    """Fetch *url* and return the raw response body as bytes.

    Retried once (tries=2, 30s delay) on HTTPError via the @retry decorator.

    NOTE(review): TLS certificate verification is deliberately disabled via
    the private ``ssl._create_unverified_context()`` helper — presumably
    because the target host has a broken certificate chain; confirm before
    tightening.
    """
    insecure_ctx = ssl._create_unverified_context()
    # 5-minute timeout; the context manager closes the connection for us.
    with urllib.request.urlopen(url, timeout=300, context=insecure_ctx) as response:
        return response.read()
class SalfordCkanScraper(CkanScraper):
    """CkanScraper variant for the Salford data quay CKAN instance.

    Strips the volatile 'tracking_summary' keys from the CKAN package
    metadata so that repeated scrapes of an unchanged dataset produce
    byte-identical output.
    """

    def get_data(self):
        """Fetch the CKAN package for ``self.url``.

        Returns:
            tuple: ``(canonical_bytes, parsed_dict)`` where the bytes form
            is the JSON re-serialized with sorted keys and 4-space
            indentation, so it is stable across runs.
        """
        data_str = get_data_from_url(self.url)
        data = json.loads(data_str.decode(self.encoding))

        # 'tracking_summary' holds per-request view counters that change on
        # every fetch; drop it so the stored payload only changes when the
        # dataset itself does.  pop(..., None) tolerates its absence, which
        # replaces the original "if key in dict: del dict[key]" dance.
        data['result'].pop('tracking_summary', None)
        for resource in data['result']['resources']:
            resource.pop('tracking_summary', None)

        return (
            bytes(json.dumps(data, sort_keys=True, indent=4), 'utf-8'), data)
# Base CKAN API endpoint; the dataset slug is appended to form the full
# package_show URL.
base_url = 'https://salforddataquay.uk/api/3/action/package_show?id='
# Scrape configuration for the polling stations dataset.
stations_info = {
    'dataset': 'salford-polling-stations',
    'extra_fields': ['revision_timestamp'],
    'return_format': 'geojson',
}
# Scrape configuration for the polling districts dataset.
districts_info = {
    'dataset': 'salford-polling-districts',
    'extra_fields': ['revision_timestamp'],
    'return_format': 'geojson'
}
# Democracy Club council identifier for Salford.
council_id = 'SLF'
def _scrape_dataset_metadata(info):
    """Run a CKAN metadata scrape for one dataset config dict.

    Returns whatever ``SalfordCkanScraper.scrape()`` yields — used below as
    the data URL for the follow-up GeoJSON scrape; falsy when no URL was
    resolved.
    """
    meta_scraper = SalfordCkanScraper(
        base_url,
        council_id,
        info['dataset'],
        info['return_format'],
        info['extra_fields'],
        'utf-8')
    return meta_scraper.scrape()


# Resolve both datasets to concrete data URLs via the CKAN API.  The two
# stanzas were previously duplicated line-for-line; order preserved:
# stations first, then districts.
stations_url = _scrape_dataset_metadata(stations_info)
districts_url = _scrape_dataset_metadata(districts_info)

# Pull the actual GeoJSON payloads.  Stations use the random-id variant;
# districts use the plain GeoJSON scraper.
if stations_url:
    stations_scraper = RandomIdGeoJSONScraper(
        stations_url, council_id, 'utf-8', 'stations')
    stations_scraper.scrape()
if districts_url:
    districts_scraper = GeoJsonScraper(
        districts_url, council_id, 'utf-8', 'districts')
    districts_scraper.scrape()