[SP] parse wrans into interim XML
struan committed Aug 20, 2024
1 parent ffcd763 commit c55c9c3
Showing 2 changed files with 163 additions and 0 deletions.
8 changes: 8 additions & 0 deletions pyscraper/sp_2024/__main__.py
@@ -14,6 +14,7 @@
from .convert import convert_xml_to_twfy
from .download import fetch_debates_for_dates, fetch_wrans_for_dates
from .parse import tidy_up_html
from .parse_wrans import tidy_up_wrans_html

file_dir = Path(__file__).parent
parldata = Path(file_dir, "..", "..", "..", "parldata")
@@ -167,6 +168,13 @@ def wrans(
    for file in file_iterator:
        pass

    if parse:
        file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
        for file in file_iterator:
            if verbose:
                print(f"Parsing {file}")
            tidy_up_wrans_html(file, parsed_dir)


if __name__ == "__main__":
    cli(prog_name="python -m pyscraper.sp_2024")
155 changes: 155 additions & 0 deletions pyscraper/sp_2024/parse_wrans.py
@@ -0,0 +1,155 @@
"""
This module contains tools to convert the unstructured HTML of written answers into structured XML.
This is not the TWFY-style XML, but tries to retain all information from the original.
"""

from __future__ import annotations

import re
from pathlib import Path

from bs4 import BeautifulSoup, Tag

# HTML elements we accept moving from raw_html to parsed
acceptable_elements = [
"a",
"abbr",
"acronym",
"address",
"b",
"big",
"blockquote",
"br",
"caption",
"center",
"cite",
"col",
"colgroup",
"dd",
"dir",
"div",
"dl",
"dt",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"i",
"img",
"li",
"ol",
"p",
"pre",
"q",
"s",
"small",
"span",
"strike",
"strong",
"sub",
"sup",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"title",
"tr",
"tt",
"u",
"ul",
"timestamp",
]


def process_raw_html(raw_html: Tag, wrans_item_url: str) -> BeautifulSoup:
"""
Given the question html, convert it to a structured xml format
This isn't yet matching TWFY schema or using the right IDs.
The goal is to make a structured file that's a bit easier to work with.
"""

    # Deal with timestamps that are not inside anything first
    raw_html = str(raw_html)
    soup = BeautifulSoup(raw_html, "html.parser")

    # convert the details list (asked by / answered by / date lodged)
    # into structured tags

    details = soup.find("ul")  # the details list; assumed always present
    speaker_re = re.compile(r"Asked by:\s*([^,]*),\s*MSP for\s*(\w.*)", re.MULTILINE)
    responder_re = re.compile(r".*Answered by\s*(\w.*)\s*on", re.MULTILINE | re.DOTALL)
    lodged_re = re.compile(r"Date lodged:\s*(\d+ \w+ \d+)", re.MULTILINE)
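    # These patterns are intended to match detail lines like the following
    # (names and dates invented for illustration):
    #   speaker_re:   "Asked by: A Member, MSP for Somewhere"
    #   responder_re: "Answered by A Minister on 1 August 2024"
    #   lodged_re:    "Date lodged: 18 July 2024"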
    for li in details.find_all("li"):
        text = li.text.strip()

        speaker_match = re.match(speaker_re, text)
        responder_match = re.match(responder_re, text)
        lodged_match = re.match(lodged_re, text)

        tag = None

        if speaker_match:
            tag = soup.new_tag("speaker")
            speaker = f"{speaker_match.group(1)}, {speaker_match.group(2)}"
            tag.append(speaker)
        elif responder_match:
            tag = soup.new_tag("responder")
            tag.append(responder_match.group(1))
        elif lodged_match:
            tag = soup.new_tag("lodged")
            tag.append(lodged_match.group(1))

        if tag:
            li.replace_with(tag)
        else:
            # unrecognised detail item: log it for debugging and drop it
            print(text)
            li.decompose()

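    # The remaining headings are assumed to pair a label with the content
    # div that follows, e.g. (abridged):
    #   <h3><strong>Question</strong></h3> <div>...question text...</div>
    #   <h3><strong>Answer</strong></h3> <div>...answer text...</div>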
    for h in soup.find_all("h3"):
        content = h.find_next("div")
        tag = None
        if h.strong.string.strip() == "Question":
            tag = soup.new_tag("question")
        elif h.strong.string.strip() == "Answer":
            tag = soup.new_tag("answer")
        if tag:
            h.replace_with(tag)
            tag.append(content)

    soup.find("raw_html").name = "parsed"

    return soup


def tidy_up_wrans_html(xml_path: Path, output_dir: Path):
"""
For each subsection there is a raw_html child
This function will convert the raw_html element to a parsed child.
This can be rerun on already downloaded data.
"""

    with xml_path.open("r") as f:
        xml = f.read()

    soup = BeautifulSoup(xml, "html.parser")

    for item in soup.find_all("question"):
        wrans_item_url = item.get("url")

        # process html
        raw_html = item.find("raw_html")
        parsed_data = process_raw_html(raw_html, wrans_item_url=wrans_item_url)
        # replace raw_html with parsed
        item.find("raw_html").decompose()
        item.append(parsed_data.find("parsed"))

    # dump the soup to a file
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / xml_path.name
    with output_file.open("w") as f:
        f.write(soup.prettify())
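
A minimal usage sketch of the new parser on its own (the paths are invented for illustration; __main__.py wires up the real directories):

from pathlib import Path

from pyscraper.sp_2024.parse_wrans import tidy_up_wrans_html

downloaded_file = Path("parldata/sp_2024/wrans/example.xml")  # hypothetical path
parsed_dir = Path("parldata/sp_2024/wrans-parsed")  # hypothetical path
tidy_up_wrans_html(downloaded_file, parsed_dir)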
