Skip to content

Commit

Permalink
Big bag of work in progress in one big commit :/
Browse files Browse the repository at this point in the history
  • Loading branch information
GeoWill committed Jun 12, 2021
1 parent a25155f commit 693ffb8
Show file tree
Hide file tree
Showing 12 changed files with 432 additions and 64 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ python-dateutil = "*"
retry = "*"
black = "*"
rich = "*"
arcgis2geojson = "*"

[dev-packages]
pytest-mypy-plugins = "*"
Expand Down
102 changes: 38 additions & 64 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions lgsf/conf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(self):
"councillors",
"templates",
"metadata",
"polling_stations"
# 'parties',
# "scrapers",
# 'reconcilers',
Expand Down
Empty file.
101 changes: 101 additions & 0 deletions lgsf/polling_stations/commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from rich.progress import Progress

from lgsf.commands.base import PerCouncilCommandBase
from lgsf.path_utils import load_scraper, load_council_info



class Command(PerCouncilCommandBase):
    """Run the polling-station scraper for each selected council.

    Iterates the councils chosen via the base class's selection options,
    loads each council's scraper module, and runs it unless it is
    disabled, already run recently (--refresh), or filtered out by tags.
    Prints a completed/missing/failed/skipped summary at the end.
    """

    command_name = "polling_stations"

    def add_arguments(self, parser):
        """Register command-specific CLI flags on *parser* (an argparse parser)."""
        parser.add_argument(
            "--check-only",
            action="store_true",
            help="Just check for updated pages, don't scrape anything",
        )
        parser.add_argument(
            "--list-missing",
            action="store_true",
            help="Print missing councils",
        )
        parser.add_argument(
            "--list-disabled",
            action="store_true",
            help="Print disabled councils",
        )

    def _run_single(self, scraper, progress, summary):
        """Run one council's scraper, tallying the outcome in *summary*.

        KeyboardInterrupt always propagates so the user can abort the
        whole run; any other exception is recorded as a failure (and
        re-raised when --verbose is set) so one broken council does not
        stop the remaining ones.
        """
        try:
            progress.console.print(scraper.options["council"])
            scraper.run()
            summary["completed"] += 1
        except KeyboardInterrupt:
            raise
        except Exception:
            # Narrowed from a bare `except:` so SystemExit and other
            # BaseExceptions still propagate.
            if self.options.get("verbose"):
                raise
            summary["failed"] += 1
            progress.console.print(
                "Error running {}, see {} for more".format(
                    self.options["council"], scraper._error_file_name()
                ),
                style="red",
            )

    def handle(self, options):
        """Entry point: run scrapers for every council in the selection."""
        self.options = options
        if options["list_missing"]:
            self.output_missing()

        if options["list_disabled"]:
            self.output_disabled()

        self.output_status()

        self.normalise_codes()
        to_run = self.councils_to_run()
        summary = {
            "completed": 0,
            "missing scraper": 0,
            "failed": 0,
            "skipped": 0,
        }
        with Progress() as progress:
            tasks = {
                "total": progress.add_task(description="Total", total=len(to_run)),
            }

            while not progress.finished:
                for council in to_run:
                    self.options["council"] = council
                    self.options["council_info"] = load_council_info(council)
                    scraper_cls = load_scraper(council, self.command_name)
                    if not scraper_cls:
                        summary["missing scraper"] += 1
                        # Must still advance the task bar: the original
                        # `continue` skipped the update, so the
                        # `while not progress.finished` loop never
                        # terminated when any council had no scraper.
                        progress.update(tasks["total"], advance=1)
                        continue
                    with scraper_cls(self.options, progress.console) as scraper:
                        should_run = True
                        if scraper.disabled:
                            should_run = False

                        if should_run and options["refresh"]:
                            if scraper.run_since():
                                should_run = False

                        if should_run and options["tags"]:
                            required_tags = set(options["tags"].split(","))
                            scraper_tags = set(scraper.get_tags)
                            if not required_tags.issubset(scraper_tags):
                                should_run = False

                        if should_run:
                            self._run_single(scraper, progress, summary)
                        else:
                            summary["skipped"] += 1

                    progress.update(tasks["total"], advance=1)
        self.console.print(summary)
26 changes: 26 additions & 0 deletions lgsf/polling_stations/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import json

class PollingStationsList:
    """Wrapper around a collection of scraped polling station records."""

    def __init__(self, stations):
        # Raw station records, kept exactly as the scraper produced them.
        self.stations = stations

    def as_file_name(self):
        """Base file name (without extension) used when saving this data."""
        return "stations"

    def as_json(self):
        """Serialise the stations as pretty-printed JSON."""
        return json.dumps(self.stations, indent=4)


class PollingDistrictsList:
    """Wrapper around a collection of scraped polling district records."""

    def __init__(self, districts):
        # Raw district records, kept exactly as the scraper produced them.
        self.districts = districts

    def as_file_name(self):
        """Base file name (without extension) used when saving this data."""
        return "districts"

    def as_json(self):
        """Serialise the districts as pretty-printed JSON."""
        return json.dumps(self.districts, indent=4)
Empty file.
57 changes: 57 additions & 0 deletions lgsf/polling_stations/scrapers/arcgis_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import json
from arcgis2geojson import arcgis2geojson

from lgsf.polling_stations.models import PollingStationsList, PollingDistrictsList
from lgsf.scrapers.base import ScraperBase
from lgsf.polling_stations.scrapers.common import (
# BaseScraper,
# get_data_from_url,
# save,
summarise,
sync_db_to_github,
truncate, PollingStationScraperBase,
)

class ArcGisScraper(PollingStationScraperBase):
    """Scraper for councils that publish polling data via an ArcGIS REST API."""

    def make_geometry(self, feature):
        """Convert one ArcGIS feature to a canonical GeoJSON string.

        sort_keys gives a stable serialisation, so identical geometries
        always produce identical strings.
        """
        return json.dumps(arcgis2geojson(feature), sort_keys=True)

    def get_data(self, url):  # pragma: no cover
        """Fetch *url* and return a ``(raw_bytes, decoded_json)`` tuple."""
        response = self.get(url)
        data_str = response.content
        data = json.loads(data_str.decode(self.encoding))
        return (data_str, data)

    def process_feature(self, feature, fields=None):
        """Build a flat record dict from one ArcGIS feature.

        *fields* is the list of field descriptors from the ArcGIS
        response (each a dict with a ``name`` key). When omitted, only
        ``council_id`` and ``geometry`` are populated — the original
        code iterated ``fields`` unconditionally and raised TypeError
        when the ``None`` default was used.
        """
        record = {
            "council_id": self.council_id,
            "geometry": self.make_geometry(feature),
        }
        for field in fields or []:
            value = feature["attributes"][field["name"]]
            # Strip stray whitespace from string attributes only.
            record[field["name"]] = value.strip() if isinstance(value, str) else value
        return record

    def scrape(self, url, type="features"):
        """Download *url* and process every feature it contains.

        ``type`` is only used to label the progress output; the name is
        kept (despite shadowing the builtin) for backward compatibility
        with keyword callers.
        """
        data_str, data = self.get_data(url)
        print(f"found {len(data['features'])} {type}")

        # Field descriptors and feature list from the ArcGIS payload.
        fields = data["fields"]
        features = data["features"]

        # process_features is provided by PollingStationScraperBase.
        return self.process_features(features, fields)
Loading

0 comments on commit 693ffb8

Please sign in to comment.