feat(okla): Improve content cleanup

Version Bump to v2.6.35
freelawproject · Oct 22, 2024 · e00485a · e00485a
1 parent 147b0f5
commit e00485a
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 7 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -16,13 +16,18 @@ Releases are also tagged in git, if that's helpful.
 
 ## Current
 
-**2.6.34 - 2024-10-22**
+**2.6.35 - 2024-10-22**
 
 Fixes:
   - Fix for `okla` cleanup_content
 
 ## Past
 
+**2.6.34 - 2024-10-22**
+
+Fixes:
+  - Fix for `okla` cleanup_content
+
 **2.6.32 - 2024-10-21**
 
 Features:

diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py
@@ -48,16 +48,26 @@ def _process_html(self):
     def cleanup_content(content):
         """Remove non-opinion HTML
 
+        Oklahoma uses ISO-8859-1 encoding, which we need to account for
+        so we don't end up with ugly HTML.  We also remove a few sections
+        and all of the A tags to avoid hyperlinking to nowhere.
+
         :param content: The scraped HTML
         :return: Cleaner HTML
         """
-        tree = strip_bad_html_tags_insecure(str(content), remove_scripts=True)
-        for removal_class in ["tmp-citationizer", "footer"]:
+        content = content.decode("ISO-8859-1")
+        tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
+        for removal_class in ["tmp-citationizer", "footer", "published-info"]:
             for element in tree.xpath(f"//div[@class='{removal_class}']"):
                 parent = element.getparent()
                 if parent is not None:
                     parent.remove(element)
 
+        for a_tag in tree.xpath("//a"):
+            span = html.Element("span")
+            span.text = a_tag.text
+            a_tag.getparent().replace(a_tag, span)
+
         opinions_navigation = tree.xpath("//div[@id='opinons-navigation']")
         if opinions_navigation:
             opinions_navigation = opinions_navigation[0]
@@ -70,6 +80,6 @@ def cleanup_content(content):
 
         # Find the core element with id 'oscn-content'
         core_element = tree.xpath("//*[@id='oscn-content']")[0]
-        return html.tostring(
-            core_element, pretty_print=True, encoding="unicode"
-        ).encode("utf-8")
+        html_content = html.tostring(core_element).decode("ISO-8859-1")
+
+        return html_content.strip()
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import find_packages, setup
 from setuptools.command.install import install
 
-VERSION = "2.6.34"
+VERSION = "2.6.35"
 AUTHOR = "Free Law Project"
 EMAIL = "[email protected]"
 HERE = os.path.abspath(os.path.dirname(__file__))