fix(utah, utahctapp): update to OpinionSiteLinear
Solves #1220
- Site had changed; updated HTML selectors
- Make utahctapp inherit from utah
Showing 6 changed files with 2,202 additions and 1,757 deletions.
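Both scrapers now follow the OpinionSiteLinear pattern: a single `_process_html` hook appends one dict per opinion to `self.cases`, and the base class turns those into the usual site attributes. A rough smoke-test sketch of the updated scraper (it hits the live site; the attribute names come from juriscraper's standard OpinionSite interface, not from this commit):

```python
# Illustrative only: exercises the rewritten utah scraper end to end.
from juriscraper.opinions.united_states.state import utah

site = utah.Site()
site.parse()  # downloads the page and runs _process_html()

# Standard attributes, populated from the self.cases dicts.
for date, docket, name in zip(
    site.case_dates, site.docket_numbers, site.case_names
):
    print(date, docket, name)
```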
juriscraper/opinions/united_states/state/utah.py
@@ -1,70 +1,42 @@
-from datetime import datetime
+import re
 
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.AbstractSite import logger
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.utcourts.gov/opinions/supopin/index.htm"
+        self.url = "https://legacy.utcourts.gov/opinions/supopin/"
         self.court_id = self.__module__
+        self.status = "Published"
 
-    def _get_case_names(self):
-        return [
-            name
-            for name in self.html.xpath(
-                '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/a/text()'
-            )
-        ]
-
-    def _get_download_urls(self):
-        return [
-            t
-            for t in self.html.xpath(
-                '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/a/@href'
-            )
-        ]
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
+    def _process_html(self):
+        for row in self.html.xpath(
+            "//div[@id='content']//p[a[contains(@href, '.pdf')]]"
         ):
-            try:
-                parts = text.strip().split(", ")
-                docket_numbers.append(parts[1])
-            except IndexError:
-                # Happens in whitespace-only text nodes.
+            if row.xpath("br"):
+                # Superseded opinions
+                logger.info("Skipping row %s", row.text_content())
                 continue
-        return docket_numbers
-
-    def _get_case_dates(self):
-        dates = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
-        ):
-            parts = text.strip().split(", ")
-            try:
-                caseDate = f"{parts[-3]}, {parts[-2]}"
-                dates.append(datetime.strptime(caseDate, "Filed %B %d, %Y"))
-            except IndexError:
-                # Happens in whitespace-only text nodes.
-                continue
-        return dates
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_citations(self):
-        neutral_citations = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
-        ):
-            try:
-                parts = text.strip().split(", ")
-                if parts[-1]:
-                    neutral_citations.append(parts[-1])
-            except IndexError:
-                # Happens in whitespace-only text nodes.
-                continue
-        return neutral_citations
+            # pick longest text; if not, HTML comments may cause wrong indexing
+            text = sorted(row.xpath("text()"))[-1]
+            neutral_cite_match = re.search(r"\d{4} UT( App)? \d{1,}", text)
+            citation = neutral_cite_match.group(0)
+
+            filed_index = text.find("Filed")
+            docket = text[:filed_index].strip(", ")
+            date_filed = text[
+                filed_index + 5 : neutral_cite_match.start()
+            ].strip(" ,")
+
+            self.cases.append(
+                {
+                    "url": row.xpath("a")[0].get("href"),
+                    "name": row.xpath("a")[0].text_content(),
+                    "date": date_filed,
+                    "citation": citation,
+                    "docket": docket,
+                }
+            )
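In the new `_process_html`, the docket number, filing date, and neutral citation are all carved out of the text node that follows each PDF link. A standalone sketch of that slicing, using a made-up row string in the `<docket>, Filed <date>, <citation>` shape the parser assumes:

```python
import re

# Hypothetical example of the text that follows a PDF link on the
# legacy.utcourts.gov opinion pages; the live markup may differ slightly.
text = ", 20230456, Filed January 18, 2024, 2024 UT 2"

# Same steps as _process_html above.
neutral_cite_match = re.search(r"\d{4} UT( App)? \d{1,}", text)
citation = neutral_cite_match.group(0)  # "2024 UT 2"

filed_index = text.find("Filed")
docket = text[:filed_index].strip(", ")  # "20230456"
date_filed = text[
    filed_index + 5 : neutral_cite_match.start()
].strip(" ,")  # "January 18, 2024"

print(docket, date_filed, citation)
```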
juriscraper/opinions/united_states/state/utahctapp.py
@@ -1,37 +1,8 @@
-import re
-from urllib.parse import quote
+from juriscraper.opinions.united_states.state import utah
 
-from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
-
-class Site(OpinionSiteLinear):
+class Site(utah.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "https://www.utcourts.gov/opinions/appopin/index.htm"
+        self.url = "https://legacy.utcourts.gov/opinions/appopin/"
         self.court_id = self.__module__
-        self.regex = r"Case No. (.*?), Filed (.*?), (\d{4} UT App \d+)"
-
-    def _process_html(self) -> None:
-        for row in self.html.xpath("//a[@class='pdf']/parent::p"):
-            link = row.xpath("./a")[0]
-            x = " ".join(row.xpath(".//text()")).strip()
-            if "Superseded" in x:
-                continue
-            m = re.search(self.regex, x)
-            if not m:
-                continue
-            date = m.groups()[1]
-            if "Filed" in date:
-                date = date.replace("Filed", "").strip()
-            citation = m.groups()[2]
-            docket_number = m.groups()[0]
-            self.cases.append(
-                {
-                    "date": date,
-                    "name": row.xpath(".//text()")[0],
-                    "citation": citation,
-                    "url": quote(link.attrib["href"], safe=":/"),
-                    "docket": docket_number,
-                    "status": "Published",
-                }
-            )
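With this change, `utahctapp.Site` keeps only its own URL and `court_id` and inherits `_process_html` from `utah.Site`, so both courts share one parser. A quick, hypothetical check of that relationship (instantiation alone does not fetch anything):

```python
from juriscraper.opinions.united_states.state import utah, utahctapp

supreme = utah.Site()
appeals = utahctapp.Site()

# utahctapp.Site defines no _process_html of its own, so attribute lookup
# on the subclass resolves to the parent's implementation.
assert utahctapp.Site._process_html is utah.Site._process_html

print(supreme.url)  # https://legacy.utcourts.gov/opinions/supopin/
print(appeals.url)  # https://legacy.utcourts.gov/opinions/appopin/
```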