Skip to content

Commit

Permalink
[SP] convert wrans intermediate XML to PW format
Browse files Browse the repository at this point in the history
  • Loading branch information
struan committed Aug 21, 2024
1 parent 3284a30 commit 6aa1784
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pyscraper/sp_2024/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import click

from .convert import convert_xml_to_twfy
from .convert_wrans import convert_wrans_xml_to_twfy
from .download import fetch_debates_for_dates, fetch_wrans_for_dates
from .parse import tidy_up_html
from .parse_wrans import tidy_up_wrans_html
Expand All @@ -22,6 +23,7 @@
download_dir = parldata / "cmpages" / "sp_2024" / "raw"
parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed"
output_dir = parldata / "scrapedxml" / "sp-new"
output_dir_wrans = parldata / "scrapedxml" / "sp-written"


@click.group()
Expand Down Expand Up @@ -175,6 +177,13 @@ def wrans(
print(f"Parsing up {file}")
tidy_up_wrans_html(file, parsed_dir)

if convert:
file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Converting {file} to TheyWorkForYou format")
convert_wrans_xml_to_twfy(file, output_dir_wrans, verbose=verbose)


if __name__ == "__main__":
cli(prog_name="python -m pyscraper.sp_2024")
146 changes: 146 additions & 0 deletions pyscraper/sp_2024/convert_wrans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""
Convert the structured data from Scottish Parliament to
the XML format used by TheyWorkForYou
Link to TWFY IDs for members.
"""

import datetime
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

from lxml import etree

from .resolvenames import get_unique_person_id, is_member_vote


@dataclass
class IDFactory:
    """
    Issues sequential ids of the form ``<base_id><iso_date>.<major>.<minor>``.

    Both counters start at -1, so the first major id issued ends in ``.0.0``.
    """

    # stored on the instance but not used when building ids
    committee_slug: str
    iso_date: str
    base_id: str = "uk.org.publicwhip/spwa/"
    latest_major: int = -1
    latest_minor: int = -1

    def _current_id(self) -> str:
        """Render the id for the counters as they currently stand."""
        return f"{self.base_id}{self.iso_date}.{self.latest_major}.{self.latest_minor}"

    def get_next_major_id(self) -> str:
        """Bump the major counter, restart the minor counter at 0, return the id."""
        self.latest_major, self.latest_minor = self.latest_major + 1, 0
        return self._current_id()

    def get_next_minor_id(self) -> str:
        """Bump the minor counter under the current major counter, return the id."""
        self.latest_minor = self.latest_minor + 1
        return self._current_id()


# Tags in the parsed intermediate XML and the TWFY element each one becomes.
_WRANS_TAG_MAP = {"question": "ques", "answer": "reply"}


def _format_heading_date(iso_date: str) -> str:
    """Format an ISO date like 'Thursday 9 June 2005' (day not zero-padded)."""
    parsed = datetime.date.fromisoformat(iso_date)
    # %d zero-pads the day and %-d is not portable, so take the day number directly
    return f"{parsed:%A} {parsed.day} {parsed:%B %Y}"


def _build_wrans_speech(
    subitem: "etree._Element",
    twfy_tag: str,
    url: str,
    iso_date: str,
    id_factory: "IDFactory",
    missing_speakers: set,
    verbose: bool,
) -> "etree._Element":
    """Build one TWFY ques/reply element from a parsed question/answer element."""
    speaker_name = subitem.get("speaker_name")
    # without a speaker_name attribute there is nothing to look up
    person_id = (
        get_unique_person_id(speaker_name, iso_date)
        if speaker_name is not None
        else None
    )
    if person_id is None and verbose and speaker_name not in missing_speakers:
        print(f"Could not find person id for {speaker_name}")
        missing_speakers.add(speaker_name)

    speech = etree.Element(twfy_tag)
    speech.set("id", id_factory.get_next_minor_id())
    speech.set("url", url)
    # attribute values must be strings, so fall back to "" when the name is absent
    speech.set("speakername", speaker_name or "")
    speech.set("person_id", person_id or "unknown")
    # appending moves each child out of subitem, so iterate over a snapshot
    for child in list(subitem):
        speech.append(child)
    return speech


def convert_wrans_xml_to_twfy(
    file_path: Path, output_dir: Path, verbose: bool = False
) -> None:
    """
    Convert from the loose structured xml format to the
    TWFY xml format

    Reads the intermediate XML at file_path and writes
    ``<output_dir>/sp-written/spwa<date>.xml``, creating the directory if
    needed. The output holds one major heading for the day, then for each
    ``spwrans`` element a minor heading followed by a ``ques`` / ``reply``
    element per parsed question / answer.

    file_path: intermediate XML file whose root element has a ``date``
        attribute in ISO format.
    output_dir: directory under which the ``sp-written`` folder is created.
    verbose: print progress, and each speaker with no person id (once per file).

    Raises whatever ``datetime.date.fromisoformat`` raises if the root's
    ``date`` attribute is missing or malformed.
    """
    if verbose:
        print(f"Converting {file_path}")

    # parse the raw bytes so the parser honours the file's own encoding
    # declaration; lxml rejects str input that carries such a declaration
    source = etree.fromstring(file_path.read_bytes())

    # root of the tree is a publicwhip object
    root = etree.Element("publicwhip")

    iso_date = source.get("date")

    # get the date in format Thursday 9 June 2005
    date_str = _format_heading_date(iso_date)

    committee_slug = "sp-written"

    # NOTE(review): the caller in __main__ already passes a directory ending in
    # "sp-written", so this nests as sp-written/sp-written/ - confirm intended
    dest_path = output_dir / committee_slug / f"spwa{iso_date}.xml"
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    id_factory = IDFactory(committee_slug=committee_slug, iso_date=iso_date)

    # there is only one major heading: the questions for this day
    major_heading = etree.Element("major-heading")
    major_heading.set("id", id_factory.get_next_major_id())
    major_heading.set("nospeaker", "True")
    major_heading.text = f"Written Questions for {date_str}"
    root.append(major_heading)

    # tracked per file so each unresolved speaker is reported only once
    missing_speakers: set = set()

    # iterate through the questions
    for item in source.iter("spwrans"):
        # each question is a minor heading using the id as the title because
        # we don't have anything else to use
        minor_heading = etree.Element("minor-heading")
        minor_heading.set("id", id_factory.get_next_minor_id())
        minor_heading.text = f"Question {item.get('id')}"
        root.append(minor_heading)

        parsed = item.find("parsed")
        if parsed is None:
            # nothing was parsed for this question; keep just its heading
            continue

        url = item.get("url") or ""
        for subitem in parsed:
            twfy_tag = _WRANS_TAG_MAP.get(subitem.tag)
            if twfy_tag is None:
                # only question/answer elements are carried over
                continue
            root.append(
                _build_wrans_speech(
                    subitem,
                    twfy_tag,
                    url,
                    iso_date,
                    id_factory,
                    missing_speakers,
                    verbose,
                )
            )

    # write the new xml to a file
    etree.indent(root, space=" ")

    with dest_path.open("wb") as f:
        f.write(etree.tostring(root, pretty_print=True))


def convert_to_twfy(
    cache_dir: Path,
    output_dir: Path,
    partial_file_name: Optional[str] = None,
    verbose: bool = False,
):
    """
    Convert the intermediate XML files in a cache directory to TWFY XML.

    cache_dir: directory holding the intermediate XML files.
    output_dir: passed through to convert_wrans_xml_to_twfy for each file.
    partial_file_name: if given, only files whose names start with this
        prefix are converted (any extension); otherwise every ``*.xml`` file.
    verbose: passed through to convert_wrans_xml_to_twfy.
    """
    pattern = f"{partial_file_name}*" if partial_file_name else "*.xml"
    # sorted() snapshots the matches before anything is written and gives a
    # stable processing order instead of filesystem order
    for xml in sorted(cache_dir.glob(pattern)):
        convert_wrans_xml_to_twfy(xml, output_dir, verbose=verbose)

0 comments on commit 6aa1784

Please sign in to comment.