diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py index 979d37ab..f929f5bc 100644 --- a/pyscraper/sp_2024/convert.py +++ b/pyscraper/sp_2024/convert.py @@ -77,6 +77,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False title = source.get("title") iso_date = source.get("date") source_id = int(float(source.get("id")[1:])) + timestamp = "" # remove [Draft] from title title = title.replace("[Draft]", "").strip() @@ -117,12 +118,33 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False speech.set("id", id_factory.get_next_minor_id()) speech.set("url", subitem.get("speech_url") or "") speech.set("speakername", speaker_name) + if timestamp: + speech.set("time", timestamp) speech.set("person_id", person_id or "unknown") for child in subitem: speech.append(child) root.append(speech) previous_speech = speech + elif subitem.tag == "timestamp": + if m := re.match('\s*(\d\d:\d\d)(.*)', subitem.text): + timestamp = m.group(1) + text = m.group(2) + else: + text = subitem.text + text = text.replace("\xa0", " ").strip() + if text: + p = etree.Element("p") + p.set("class", "italic") + p.text = text + speech = etree.Element("speech") + speech.set("id", id_factory.get_next_minor_id()) + speech.set("url", subitem.get("speech_url") or "") + if timestamp: + speech.set("time", timestamp) + speech.append(p) + root.append(speech) + elif subitem.tag == "heading": minor_heading = etree.Element("minor-heading") minor_heading.set("id", id_factory.get_next_minor_id()) diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py index bcb611f6..498a2cae 100644 --- a/pyscraper/sp_2024/parse.py +++ b/pyscraper/sp_2024/parse.py @@ -5,9 +5,10 @@ from __future__ import annotations +import re from pathlib import Path -from bs4 import BeautifulSoup, Tag, NavigableString +from bs4 import BeautifulSoup, Tag # HTML elements we accept moving from raw_html to parsed acceptable_elements = [ @@ -61,7 +62,6 @@ "tt", "u", "ul", - "timestamp" ] @@ -71,7 +71,11 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup: This isn't yet matching TWFY schema or using the right IDs. The goal is to make a structured file that's a bit easier to work with. """ - soup = BeautifulSoup(str(raw_html), "html.parser") + + # Deal with timestamps that are not inside anything first + raw_html = str(raw_html) + raw_html = re.sub('(?m)^\s*(.*?)\s*
\s*
', r'\1', raw_html) + soup = BeautifulSoup(raw_html, "html.parser") # convert a structure where there's a strong tag with an a inside to a speech tag @@ -139,31 +143,13 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup: # sequential tags from an acceptable element should be grouped together under the speech # to create the speech object for speaker in soup.find_all("speech"): - next_sibling = speaker.next_sibling - while next_sibling: - if isinstance(next_sibling, NavigableString) and str(next_sibling).strip(): - nt = soup.new_tag("timestamp") - # replace all weird whitespace with normal space - nt.string = next_sibling.replace("\xa0", " ").strip() - ns = next_sibling.next_sibling - while ns and (ns.name == 'br' or (isinstance(ns, NavigableString) and not ns.strip())): - next_ns = ns.next_sibling - ns.extract() - ns = next_ns - nt.next_sibling = ns - # delete original navigablestring - next_sibling.extract() - next_sibling = nt - if next_sibling.name in acceptable_elements: - # if the class is 'or-contribution-box' remove that class - if next_sibling.get("class") == ["or-contribution-box"]: - del next_sibling["class"] - speaker.append(next_sibling) - next_sibling = speaker.next_sibling - elif next_sibling.name == "speech": - break - else: - next_sibling = next_sibling.next_sibling + next_sibling = speaker.find_next_sibling() + while next_sibling and next_sibling.name in acceptable_elements: + # if the class is 'or-contribution-box' remove that class + if next_sibling.get("class") == ["or-contribution-box"]: + del next_sibling["class"] + speaker.append(next_sibling) + next_sibling = speaker.find_next_sibling() # now, in each speech - we want to iterate through and check for a p tag that's just 'For' or 'Against' # if so the next sibling will be a list of speakers seperated by