Skip to content

Commit

Permalink
fixup! Add scraper for new Scottish Parliament site
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Apr 23, 2024
1 parent 012baa0 commit 289c0af
Showing 1 changed file with 22 additions and 27 deletions.
49 changes: 22 additions & 27 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@

from pathlib import Path

from bs4 import BeautifulSoup
from lxml import etree

from bs4 import BeautifulSoup, Tag

# HTML elements we accept moving from raw_html to parsed
acceptable_elements = [
Expand Down Expand Up @@ -66,13 +64,13 @@
]


def process_raw_html(html: str, agenda_item_url: str):
def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
"""
Given the debate html, convert it to a structured xml format
This isn't yet matching TWFY schema or using the right IDs.
The goal is to make a structured file that's a bit easier to work with.
"""
soup = BeautifulSoup(html, "html.parser")
soup = BeautifulSoup(str(raw_html), "html.parser")

# convert a structure where there's a strong tag with an a inside to a speech tag

Expand Down Expand Up @@ -223,8 +221,7 @@ def process_raw_html(html: str, agenda_item_url: str):
new_speech.extend(after)
division.insert_after(new_speech)

# convert soup into etree
return etree.fromstring(str(soup))
return soup


def tidy_up_html(xml_path: Path):
Expand All @@ -237,27 +234,25 @@ def tidy_up_html(xml_path: Path):
with xml_path.open("r") as f:
xml = f.read()

root = etree.fromstring(xml)
soup = BeautifulSoup(xml, "html.parser")

for item in root.iter("agenda_item"):
for item in soup.find_all("agenda_item"):
agenda_item_url = item.get("url")
# get the raw_html element
raw_html = item.find("raw_html")
# convert the contents of the raw_html element to a string
raw_html_str = etree.tostring(raw_html, pretty_print=True).decode("utf-8")

# delete the any 'parsed' child of the subsection element
for child in item:
if child.tag == "parsed":
item.remove(child)

# add a new parsed element
parsed = etree.Element("parsed")
for child in process_raw_html(raw_html_str, agenda_item_url=agenda_item_url):
parsed.append(child)
item.append(parsed)
# delete any 'parsed' child of the subsection element
for child in item.find_all("parsed"):
child.decompose()

etree.indent(root, space=" ")

with xml_path.open("wb") as f:
f.write(etree.tostring(root, pretty_print=True))
# get the raw_html element
raw_html = item.find("raw_html")
parsed = process_raw_html(raw_html, agenda_item_url=agenda_item_url)
# add this as an element under the soup item
parsed_element = soup.new_tag("parsed")
item.append(parsed_element)
# move all children of the raw_html element inside parsed into the new parsed element
for child in parsed.find("raw_html").children:
parsed_element.append(child)

# dump the soup to a file
with xml_path.open("w") as f:
f.write(soup.prettify())

0 comments on commit 289c0af

Please sign in to comment.