Switch to different way of parsing, want them outside speeches. Add TWFY bit
mysociety · Apr 23, 2024 · 2b9c9f3 · 2b9c9f3
1 parent 042d2ad
commit 2b9c9f3
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 28 deletions.
diff --git a/pyscraper/sp_2024/convert.py b/pyscraper/sp_2024/convert.py
@@ -77,6 +77,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
     title = source.get("title")
     iso_date = source.get("date")
     source_id = int(float(source.get("id")[1:]))
+    timestamp = ""
 
     # remove [Draft] from title
     title = title.replace("[Draft]", "").strip()
@@ -117,12 +118,33 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
                 speech.set("id", id_factory.get_next_minor_id())
                 speech.set("url", subitem.get("speech_url") or "")
                 speech.set("speakername", speaker_name)
+                if timestamp:
+                    speech.set("time", timestamp)
                 speech.set("person_id", person_id or "unknown")
                 for child in subitem:
                     speech.append(child)
                 root.append(speech)
                 previous_speech = speech
 
+            elif subitem.tag == "timestamp":
+                if m := re.match('\s*(\d\d:\d\d)(.*)', subitem.text):
+                    timestamp = m.group(1)
+                    text = m.group(2)
+                else:
+                    text = subitem.text
+                text = text.replace("\xa0", " ").strip()
+                if text:
+                    p = etree.Element("p")
+                    p.set("class", "italic")
+                    p.text = text
+                    speech = etree.Element("speech")
+                    speech.set("id", id_factory.get_next_minor_id())
+                    speech.set("url", subitem.get("speech_url") or "")
+                    if timestamp:
+                        speech.set("time", timestamp)
+                    speech.append(p)
+                    root.append(speech)
+
             elif subitem.tag == "heading":
                 minor_heading = etree.Element("minor-heading")
                 minor_heading.set("id", id_factory.get_next_minor_id())

diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py
@@ -5,9 +5,10 @@
 
 from __future__ import annotations
 
+import re
 from pathlib import Path
 
-from bs4 import BeautifulSoup, Tag, NavigableString
+from bs4 import BeautifulSoup, Tag
 
 # HTML elements we accept moving from raw_html to parsed
 acceptable_elements = [
@@ -61,7 +62,6 @@
     "tt",
     "u",
     "ul",
-    "timestamp"
 ]
 
 
@@ -71,7 +71,11 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
     This isn't yet matching TWFY schema or using the right IDs.
     The goal is to make a structured file that's a bit easier to work with.
     """
-    soup = BeautifulSoup(str(raw_html), "html.parser")
+
+    # Deal with timestamps that are not inside anything first
+    raw_html = str(raw_html)
+    raw_html = re.sub('(?m)^\s*(.*?)\s*<br/>\s*<br/>', r'<timestamp>\1</timestamp>', raw_html)
+    soup = BeautifulSoup(raw_html, "html.parser")
 
     # convert a structure where there's a strong tag with an a inside to a speech tag
 
@@ -139,31 +143,13 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
     # sequential tags from an acceptable element should be grouped together under the speech
     # to create the speech object
     for speaker in soup.find_all("speech"):
-        next_sibling = speaker.next_sibling
-        while next_sibling:
-            if isinstance(next_sibling, NavigableString) and str(next_sibling).strip():
-                nt = soup.new_tag("timestamp")
-                # replace all weird whitespace with normal space
-                nt.string = next_sibling.replace("\xa0", " ").strip()
-                ns = next_sibling.next_sibling
-                while ns and (ns.name == 'br' or (isinstance(ns, NavigableString) and not ns.strip())):
-                    next_ns = ns.next_sibling
-                    ns.extract()
-                    ns = next_ns
-                nt.next_sibling = ns
-                # delete original navigablestring
-                next_sibling.extract()
-                next_sibling = nt
-            if next_sibling.name in acceptable_elements:
-                # if the class is 'or-contribution-box' remove that class
-                if next_sibling.get("class") == ["or-contribution-box"]:
-                    del next_sibling["class"]
-                speaker.append(next_sibling)
-                next_sibling = speaker.next_sibling
-            elif next_sibling.name == "speech":
-                break
-            else:
-                next_sibling = next_sibling.next_sibling
+        next_sibling = speaker.find_next_sibling()
+        while next_sibling and next_sibling.name in acceptable_elements:
+            # if the class is 'or-contribution-box' remove that class
+            if next_sibling.get("class") == ["or-contribution-box"]:
+                del next_sibling["class"]
+            speaker.append(next_sibling)
+            next_sibling = speaker.find_next_sibling()
 
     # now, in each speech - we want to iterate through and check for a p tag that's just 'For' or 'Against'
     # if so the next sibling will be a list of speakers seperated by <br/>