Skip to content

Commit

Permalink
feat(okla): Improve content cleanup
Browse files Browse the repository at this point in the history
Version Bump to v2.6.35
  • Loading branch information
flooie committed Oct 22, 2024
1 parent 147b0f5 commit e00485a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
7 changes: 6 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,18 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.34 - 2024-10-22**
**2.6.35 - 2024-10-22**

Fixes:
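- Fix for `okla` cleanup_content: decode ISO-8859-1 content, remove the `published-info` section, and replace `<a>` tags with `<span>` elements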

## Past

**2.6.34 - 2024-10-22**

Fixes:
- Fix for `okla` cleanup_content

**2.6.32 - 2024-10-21**

Features:
Expand Down
20 changes: 15 additions & 5 deletions juriscraper/opinions/united_states/state/okla.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,26 @@ def _process_html(self):
def cleanup_content(content):
"""Remove non-opinion HTML
Oklahoma uses ISO-8859-1 encoding, which we need to account for
so we don't end up with ugly HTML. We also remove a few sections
and all of the A tags to avoid hyperlinking to nowhere.
:param content: The scraped HTML
:return: Cleaner HTML
"""
tree = strip_bad_html_tags_insecure(str(content), remove_scripts=True)
for removal_class in ["tmp-citationizer", "footer"]:
content = content.decode("ISO-8859-1")
tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
for removal_class in ["tmp-citationizer", "footer", "published-info"]:
for element in tree.xpath(f"//div[@class='{removal_class}']"):
parent = element.getparent()
if parent is not None:
parent.remove(element)

for a_tag in tree.xpath("//a"):
span = html.Element("span")
span.text = a_tag.text
a_tag.getparent().replace(a_tag, span)

opinions_navigation = tree.xpath("//div[@id='opinons-navigation']")
if opinions_navigation:
opinions_navigation = opinions_navigation[0]
Expand All @@ -70,6 +80,6 @@ def cleanup_content(content):

# Find the core element with id 'oscn-content'
core_element = tree.xpath("//*[@id='oscn-content']")[0]
return html.tostring(
core_element, pretty_print=True, encoding="unicode"
).encode("utf-8")
html_content = html.tostring(core_element).decode("ISO-8859-1")

return html_content.strip()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from setuptools import find_packages, setup
from setuptools.command.install import install

VERSION = "2.6.34"
VERSION = "2.6.35"
AUTHOR = "Free Law Project"
EMAIL = "[email protected]"
HERE = os.path.abspath(os.path.dirname(__file__))
Expand Down

0 comments on commit e00485a

Please sign in to comment.