Skip to content

Commit

Permalink
fixup! Add scraper for new Scottish Parliament site
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 23, 2024
1 parent 012baa0 commit 289c0af
Showing 1 changed file with 22 additions and 27 deletions.
49 changes: 22 additions & 27 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@

from pathlib import Path

from bs4 import BeautifulSoup
from lxml import etree

from bs4 import BeautifulSoup, Tag

# HTML elements we accept moving from raw_html to parsed
acceptable_elements = [
Expand Down Expand Up @@ -66,13 +64,13 @@
]


def process_raw_html(html: str, agenda_item_url: str):
def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
"""
Given the debate html, convert it to a structured xml format
This isn't yet matching TWFY schema or using the right IDs.
The goal is to make a structured file that's a bit easier to work with.
"""
soup = BeautifulSoup(html, "html.parser")
soup = BeautifulSoup(str(raw_html), "html.parser")

# convert a structure where there's a strong tag with an a inside to a speech tag

Expand Down Expand Up @@ -223,8 +221,7 @@ def process_raw_html(html: str, agenda_item_url: str):
new_speech.extend(after)
division.insert_after(new_speech)

# convert soup into etree
return etree.fromstring(str(soup))
return soup


def tidy_up_html(xml_path: Path):
Expand All @@ -237,27 +234,25 @@ def tidy_up_html(xml_path: Path):
with xml_path.open("r") as f:
xml = f.read()

root = etree.fromstring(xml)
soup = BeautifulSoup(xml, "html.parser")

for item in root.iter("agenda_item"):
for item in soup.find_all("agenda_item"):
agenda_item_url = item.get("url")
# get the raw_html element
raw_html = item.find("raw_html")
# convert the contents of the raw_html element to a string
raw_html_str = etree.tostring(raw_html, pretty_print=True).decode("utf-8")

# delete the any 'parsed' child of the subsection element
for child in item:
if child.tag == "parsed":
item.remove(child)

# add a new parsed element
parsed = etree.Element("parsed")
for child in process_raw_html(raw_html_str, agenda_item_url=agenda_item_url):
parsed.append(child)
item.append(parsed)
# delete any 'parsed' child of the subsection element
for child in item.find_all("parsed"):
child.decompose()

etree.indent(root, space=" ")

with xml_path.open("wb") as f:
f.write(etree.tostring(root, pretty_print=True))
# get the raw_html element
raw_html = item.find("raw_html")
parsed = process_raw_html(raw_html, agenda_item_url=agenda_item_url)
# add this as an element under the soup item
parsed_element = soup.new_tag("parsed")
item.append(parsed_element)
# move all children of the raw_html element inside parsed into the new parsed element
for child in parsed.find("raw_html").children:
parsed_element.append(child)

# dump the soup to a file
with xml_path.open("w") as f:
f.write(soup.prettify())

0 comments on commit 289c0af

Please sign in to comment.