Skip to content

Commit

Permalink
fix(utah, utahctapp): update to OpinionSiteLinear
Browse files Browse the repository at this point in the history
Solves #1220

- Site had changed; updated HTML selectors
- make utahctapp inherit from utah
  • Loading branch information
grossir committed Nov 11, 2024
1 parent 0644f6f commit dfc5be8
Show file tree
Hide file tree
Showing 6 changed files with 2,202 additions and 1,757 deletions.
92 changes: 32 additions & 60 deletions juriscraper/opinions/united_states/state/utah.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,42 @@
from datetime import datetime
import re

from juriscraper.OpinionSite import OpinionSite
from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """Scraper for the Utah Supreme Court opinions index page.

    Each opinion row is a ``<p>`` holding an ``<a>`` link to the PDF and a
    trailing text node shaped like:
        ", No. 20230123, Filed November 7, 2024, 2024 UT 42"
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url = "https://legacy.utcourts.gov/opinions/supopin/"
        self.court_id = self.__module__
        self.status = "Published"

    def _process_html(self):
        """Parse each opinion row into a ``self.cases`` dict.

        Skips rows marked superseded (rendered with a ``<br>``) and rows
        whose metadata cannot be parsed (no neutral citation or no
        "Filed" marker), logging each skip instead of crashing.
        """
        for row in self.html.xpath(
            "//div[@id='content']//p[a[contains(@href, '.pdf')]]"
        ):
            if row.xpath("br"):
                # Superseded opinions
                logger.info("Skipping row %s", row.text_content())
                continue

            # Pick the longest text node: HTML comments may split the row
            # text into several nodes, and a plain sorted()[-1] would pick
            # the lexicographically last node, not the metadata string.
            text = max(row.xpath("text()"), key=len)

            neutral_cite_match = re.search(r"\d{4} UT( App)? \d{1,}", text)
            if neutral_cite_match is None:
                # No neutral citation; .group(0) would raise AttributeError.
                logger.info("Skipping row without citation: %s", text)
                continue
            citation = neutral_cite_match.group(0)

            filed_index = text.find("Filed")
            if filed_index == -1:
                # Without the "Filed" marker we cannot split the docket
                # number from the filing date; skip rather than mis-slice.
                logger.info("Skipping row without filing date: %s", text)
                continue
            docket = text[:filed_index].strip(", ")
            date_filed = text[
                filed_index + len("Filed") : neutral_cite_match.start()
            ].strip(" ,")

            anchor = row.xpath("a")[0]
            self.cases.append(
                {
                    "url": anchor.get("href"),
                    "name": anchor.text_content(),
                    "date": date_filed,
                    "citation": citation,
                    "docket": docket,
                }
            )
35 changes: 3 additions & 32 deletions juriscraper/opinions/united_states/state/utahctapp.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,8 @@
import re
from urllib.parse import quote
from juriscraper.opinions.united_states.state import utah

from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(utah.Site):
    """Scraper for the Utah Court of Appeals opinions index page.

    All row-parsing logic is inherited from the Utah Supreme Court
    scraper; only the index URL and court id differ.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Override only what differs from the parent scraper.
        self.court_id = self.__module__
        self.url = "https://legacy.utcourts.gov/opinions/appopin/"
Loading

0 comments on commit dfc5be8

Please sign in to comment.