fix(utah, utahctapp): update to OpinionSiteLinear
Solves #1220
- Site had changed; updated HTML selectors
- Make utahctapp inherit from utah
Showing 6 changed files with 2,202 additions and 1,757 deletions.
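Both scrapers now follow the OpinionSiteLinear pattern: a single `_process_html` hook appends one dict per opinion to `self.cases`, and the base class turns those into the usual site attributes. A rough smoke-test sketch of the updated scraper (it hits the live site; the attribute names come from juriscraper's standard OpinionSite interface, not from this commit):

```python
# Illustrative only: exercises the rewritten utah scraper end to end.
from juriscraper.opinions.united_states.state import utah

site = utah.Site()
site.parse()  # downloads the page and runs _process_html()

# Standard attributes, populated from the self.cases dicts.
for date, docket, name in zip(
    site.case_dates, site.docket_numbers, site.case_names
):
    print(date, docket, name)
```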
juriscraper/opinions/united_states/state/utah.py
@@ -1,70 +1,42 @@
-from datetime import datetime
+import re
 
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.AbstractSite import logger
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.utcourts.gov/opinions/supopin/index.htm"
+        self.url = "https://legacy.utcourts.gov/opinions/supopin/"
         self.court_id = self.__module__
+        self.status = "Published"
 
-    def _get_case_names(self):
-        return [
-            name
-            for name in self.html.xpath(
-                '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/a/text()'
-            )
-        ]
-
-    def _get_download_urls(self):
-        return [
-            t
-            for t in self.html.xpath(
-                '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/a/@href'
-            )
-        ]
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
+    def _process_html(self):
+        for row in self.html.xpath(
+            "//div[@id='content']//p[a[contains(@href, '.pdf')]]"
         ):
-            try:
-                parts = text.strip().split(", ")
-                docket_numbers.append(parts[1])
-            except IndexError:
-                # Happens in whitespace-only text nodes.
+            if row.xpath("br"):
+                # Superseded opinions
+                logger.info("Skipping row %s", row.text_content())
                 continue
-        return docket_numbers
-
-    def _get_case_dates(self):
-        dates = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
-        ):
-            parts = text.strip().split(", ")
-            try:
-                caseDate = f"{parts[-3]}, {parts[-2]}"
-                dates.append(datetime.strptime(caseDate, "Filed %B %d, %Y"))
-            except IndexError:
-                # Happens in whitespace-only text nodes.
-                continue
-        return dates
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_citations(self):
-        neutral_citations = []
-        for text in self.html.xpath(
-            '/html/body//div[@id="content"]//p[a[@class="bodylink"]]/text()'
-        ):
-            try:
-                parts = text.strip().split(", ")
-                if parts[-1]:
-                    neutral_citations.append(parts[-1])
-            except IndexError:
-                # Happens in whitespace-only text nodes.
-                continue
-        return neutral_citations
+            # pick longest text; if not, HTML comments may cause wrong indexing
+            text = sorted(row.xpath("text()"))[-1]
+            neutral_cite_match = re.search(r"\d{4} UT( App)? \d{1,}", text)
+            citation = neutral_cite_match.group(0)
+
+            filed_index = text.find("Filed")
+            docket = text[:filed_index].strip(", ")
+            date_filed = text[
+                filed_index + 5 : neutral_cite_match.start()
+            ].strip(" ,")
+
+            self.cases.append(
+                {
+                    "url": row.xpath("a")[0].get("href"),
+                    "name": row.xpath("a")[0].text_content(),
+                    "date": date_filed,
+                    "citation": citation,
+                    "docket": docket,
+                }
+            )
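In the new `_process_html`, the docket number, filing date, and neutral citation are all carved out of the text node that follows each PDF link. A standalone sketch of that slicing, using a made-up row string in the `<docket>, Filed <date>, <citation>` shape the parser assumes:

```python
import re

# Hypothetical example of the text that follows a PDF link on the
# legacy.utcourts.gov opinion pages; the live markup may differ slightly.
text = ", 20230456, Filed January 18, 2024, 2024 UT 2"

# Same steps as _process_html above.
neutral_cite_match = re.search(r"\d{4} UT( App)? \d{1,}", text)
citation = neutral_cite_match.group(0)  # "2024 UT 2"

filed_index = text.find("Filed")
docket = text[:filed_index].strip(", ")  # "20230456"
date_filed = text[
    filed_index + 5 : neutral_cite_match.start()
].strip(" ,")  # "January 18, 2024"

print(docket, date_filed, citation)
```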
juriscraper/opinions/united_states/state/utahctapp.py
@@ -1,37 +1,8 @@
-import re
-from urllib.parse import quote
+from juriscraper.opinions.united_states.state import utah
 
-from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
-
-class Site(OpinionSiteLinear):
+class Site(utah.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "https://www.utcourts.gov/opinions/appopin/index.htm"
+        self.url = "https://legacy.utcourts.gov/opinions/appopin/"
         self.court_id = self.__module__
-        self.regex = r"Case No. (.*?), Filed (.*?), (\d{4} UT App \d+)"
-
-    def _process_html(self) -> None:
-        for row in self.html.xpath("//a[@class='pdf']/parent::p"):
-            link = row.xpath("./a")[0]
-            x = " ".join(row.xpath(".//text()")).strip()
-            if "Superseded" in x:
-                continue
-            m = re.search(self.regex, x)
-            if not m:
-                continue
-            date = m.groups()[1]
-            if "Filed" in date:
-                date = date.replace("Filed", "").strip()
-            citation = m.groups()[2]
-            docket_number = m.groups()[0]
-            self.cases.append(
-                {
-                    "date": date,
-                    "name": row.xpath(".//text()")[0],
-                    "citation": citation,
-                    "url": quote(link.attrib["href"], safe=":/"),
-                    "docket": docket_number,
-                    "status": "Published",
-                }
-            )
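With this change, `utahctapp.Site` keeps only its own URL and `court_id` and inherits `_process_html` from `utah.Site`, so both courts share one parser. A quick, hypothetical check of that relationship (instantiation alone does not fetch anything):

```python
from juriscraper.opinions.united_states.state import utah, utahctapp

supreme = utah.Site()
appeals = utahctapp.Site()

# utahctapp.Site defines no _process_html of its own, so attribute lookup
# on the subclass resolves to the parent's implementation.
assert utahctapp.Site._process_html is utah.Site._process_html

print(supreme.url)  # https://legacy.utcourts.gov/opinions/supopin/
print(appeals.url)  # https://legacy.utcourts.gov/opinions/appopin/
```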