Skip to content

Commit

Permalink
Switch to different way of parsing, want them outside speeches. Add T…
Browse files Browse the repository at this point in the history
…WFY bit
  • Loading branch information
dracos committed Apr 23, 2024
1 parent 042d2ad commit 2b9c9f3
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 28 deletions.
22 changes: 22 additions & 0 deletions pyscraper/sp_2024/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
title = source.get("title")
iso_date = source.get("date")
source_id = int(float(source.get("id")[1:]))
timestamp = ""

# remove [Draft] from title
title = title.replace("[Draft]", "").strip()
Expand Down Expand Up @@ -117,12 +118,33 @@ def convert_xml_to_twfy(file_path: Path, output_dir: Path, verbose: bool = False
speech.set("id", id_factory.get_next_minor_id())
speech.set("url", subitem.get("speech_url") or "")
speech.set("speakername", speaker_name)
if timestamp:
speech.set("time", timestamp)
speech.set("person_id", person_id or "unknown")
for child in subitem:
speech.append(child)
root.append(speech)
previous_speech = speech

elif subitem.tag == "timestamp":
if m := re.match('\s*(\d\d:\d\d)(.*)', subitem.text):
timestamp = m.group(1)
text = m.group(2)
else:
text = subitem.text
text = text.replace("\xa0", " ").strip()
if text:
p = etree.Element("p")
p.set("class", "italic")
p.text = text
speech = etree.Element("speech")
speech.set("id", id_factory.get_next_minor_id())
speech.set("url", subitem.get("speech_url") or "")
if timestamp:
speech.set("time", timestamp)
speech.append(p)
root.append(speech)

elif subitem.tag == "heading":
minor_heading = etree.Element("minor-heading")
minor_heading.set("id", id_factory.get_next_minor_id())
Expand Down
42 changes: 14 additions & 28 deletions pyscraper/sp_2024/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

from __future__ import annotations

import re
from pathlib import Path

from bs4 import BeautifulSoup, Tag, NavigableString
from bs4 import BeautifulSoup, Tag

# HTML elements we accept moving from raw_html to parsed
acceptable_elements = [
Expand Down Expand Up @@ -61,7 +62,6 @@
"tt",
"u",
"ul",
"timestamp"
]


Expand All @@ -71,7 +71,11 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
This isn't yet matching TWFY schema or using the right IDs.
The goal is to make a structured file that's a bit easier to work with.
"""
soup = BeautifulSoup(str(raw_html), "html.parser")

# Deal with timestamps that are not inside anything first
raw_html = str(raw_html)
raw_html = re.sub('(?m)^\s*(.*?)\s*<br/>\s*<br/>', r'<timestamp>\1</timestamp>', raw_html)
soup = BeautifulSoup(raw_html, "html.parser")

# convert a structure where a <strong> tag has an <a> tag inside it into a speech tag

Expand Down Expand Up @@ -139,31 +143,13 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup:
# sequential tags from an acceptable element should be grouped together under the speech
# to create the speech object
for speaker in soup.find_all("speech"):
next_sibling = speaker.next_sibling
while next_sibling:
if isinstance(next_sibling, NavigableString) and str(next_sibling).strip():
nt = soup.new_tag("timestamp")
# replace all weird whitespace with normal space
nt.string = next_sibling.replace("\xa0", " ").strip()
ns = next_sibling.next_sibling
while ns and (ns.name == 'br' or (isinstance(ns, NavigableString) and not ns.strip())):
next_ns = ns.next_sibling
ns.extract()
ns = next_ns
nt.next_sibling = ns
# delete original navigablestring
next_sibling.extract()
next_sibling = nt
if next_sibling.name in acceptable_elements:
# if the class is 'or-contribution-box' remove that class
if next_sibling.get("class") == ["or-contribution-box"]:
del next_sibling["class"]
speaker.append(next_sibling)
next_sibling = speaker.next_sibling
elif next_sibling.name == "speech":
break
else:
next_sibling = next_sibling.next_sibling
next_sibling = speaker.find_next_sibling()
while next_sibling and next_sibling.name in acceptable_elements:
# if the class is 'or-contribution-box' remove that class
if next_sibling.get("class") == ["or-contribution-box"]:
del next_sibling["class"]
speaker.append(next_sibling)
next_sibling = speaker.find_next_sibling()

# now, in each speech - we want to iterate through and check for a p tag that's just 'For' or 'Against'
# if so the next sibling will be a list of speakers separated by <br/>
Expand Down

0 comments on commit 2b9c9f3

Please sign in to comment.