
Commit

Merge pull request #1224 from freelawproject/uscfc_new_site
fix(uscfc): implement new site
flooie authored Nov 7, 2024
2 parents ed44c1e + a044dde commit 22226ee
Showing 18 changed files with 74,635 additions and 5,397 deletions.
@@ -11,9 +11,7 @@
"nmcca",
"tax",
"uscfc",
"uscfc_u",
"uscfc_vaccine",
"uscfc_vaccine_u",
"fisc",
"fiscr",
]
185 changes: 67 additions & 118 deletions juriscraper/opinions/united_states/federal_special/uscfc.py
@@ -4,137 +4,86 @@
 Notes:
     Scraper adapted for new website as of February 20, 2014.
+    2024-10-23, grossir: implemented new site
 """
 
-import datetime
+import json
 import re
 
-from lxml import html
-
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.lib.string_utils import (
-    clean_if_py3,
-    convert_date_string,
-    titlecase,
-)
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.lib.string_utils import titlecase
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")
+    other_date_regex = re.compile(r"\([Oo]riginally filed:?[\d\s/]+\)")
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/8"
-        self.back_scrape_iterable = list(range(1, 4))
+        self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentOpinionsOfTheCourt.pl"
         self.court_id = self.__module__
-        self.today = datetime.datetime.now()
-
-    def _download(self, request_dict={}):
-        if self.test_mode_enabled():
-            # Use static 'today' date for consistent test results
-            self.today = convert_date_string("2018/10/17")
-        return super()._download(request_dict)
-
-    def _get_case_dates(self):
-        dates = []
-        for item in self.html.xpath('//span[@class="feed-item-date"]'):
-            text = item.text_content().strip()
-            words = text.split()
-            if len(words) == 2:
-                date = convert_date_string(words[1])
-            elif "ago" in text:
-                # The record was added today "X hours and Y min ago"
-                date = self.today
-            else:
-                raise InsanityException(
-                    f"Unrecognized date element string: {text}"
-                )
-            dates.append(date)
-        return dates
-
-    def _get_case_names(self):
-        case_names = []
-        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
-            t = " ".join(clean_if_py3(t).split())  # Normalize whitespace
-            if t.strip():
-                # If there is something other than whitespace...
-                if not isinstance(t, str):
-                    t = str(t, encoding="utf-8")
-
-                if " • " in t:
-                    t = t.split(" • ")[1].strip()
-                t = titlecase(t.lower())
-                case_names.append(t)
-        return case_names
-
-    def _get_download_urls(self):
-        path = '//h3[@class="feed-item-title"]/a/@href'
-        return list(self.html.xpath(path))
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
-            t = clean_if_py3(t)
-            if t.strip():
-                # If there is something other than whitespace...
-                if not isinstance(t, str):
-                    t = str(t, encoding="utf-8")
-
-                if " • " in t:
-                    t = t.split(" • ")[0].strip()
-                docket_numbers.append(t)
-        return docket_numbers
-
-    def _get_summaries(self):
-        summaries = []
-        path = '//div[@class="feed-item-body"]'
-        for e in self.html.xpath(path):
-            s = html.tostring(e, method="text", encoding="unicode")
-            s = clean_if_py3(s).split("Keywords:")[0]
-            summaries.append(s)
-
-        return summaries
-
-    def _get_judges(self):
-        path = '//div[@class="feed-item-body"]'
-        judges = []
-        splitters = [
-            "Signed by Chief Judge",
-            "Signed by Judge",
-            "Signed by Chief Special Master",  # Vaccine courts have odd names for judges
-            "Signed by Special Master",
-        ]
-        for e in self.html.xpath(path):
-            t = html.tostring(e, method="text", encoding="unicode")
-            t = clean_if_py3(t).split("Keywords:")[0]
-            for splitter in splitters:
-                judge_parts = t.rsplit(splitter)
-                if len(judge_parts) == 1:
-                    # No splits found...
-                    judge = ""
-                    continue
-                else:
-                    judge = judge_parts[1]
-                    break
-
-            # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to parties.'
-            # In that case we only want the name, not the rest.
-            length_of_match = 2
-            m = re.search(
-                r"[a-z]{%s}\." % length_of_match, judge
-            )  # Two lower case letters followed by a period
-            if m:
-                judge = judge[: m.start() + length_of_match]
-            else:
-                judge = ""
-            judge.strip(".")
-            judges.append(judge)
-        return judges
-
-    def _download_backwards(self, page):
-        self.url = (
-            f"http://www.uscfc.uscourts.gov/aggregator/sources/8?page={page}"
-        )
-        self.html = self._download()
+        self.is_vaccine = "uscfc_vaccine" in self.court_id
+
+    def _process_html(self):
+        """The site returns a page with all opinions for this time period
+        The opinions are inside a <script> tag, as a Javascript constant
+        that will be parsed using json.loads
+        """
+        judges_mapper = {
+            option.get("value"): option.text_content()
+            for option in self.html.xpath("//select[@name='judge']//option")
+        }
+        judges_mapper.pop("UNKNOWN", "")
+        judges_mapper.pop("all", "")
+
+        raw_data = (
+            self.html.xpath("//script")[0]
+            .text_content()
+            .strip()
+            .strip("; ")
+            .split("= ", 1)[1]
+        )
+
+        for opinion in json.loads(raw_data):
+            docket, name = opinion["title"].split(" &bull; ", 1)
+
+            summary = opinion["text"]
+            if judge_match := self.judge_regex.search(summary):
+                judge = judge_match.group("judge").strip(" .()")
+                # Remove: "Signed by ... . Service on parties made"
+                summary = summary[: judge_match.start()].strip(", .()")
+            else:
+                judge = judges_mapper.get(opinion["judge"], "")
+
+            other_date = ""
+            if other_date_match := self.other_date_regex.search(summary):
+                other_date = other_date_match.group(0).strip("() ")
+                summary = re.sub(self.other_date_regex, "", summary)
+
+            if opinion["criteria"] == "unreported":
+                status = "Unpublished"
+            elif opinion["criteria"] == "reported":
+                status = "Published"
+            else:
+                status = "Unknown"
+
+            parsed_case = {
+                "url": opinion["link"],
+                "date": opinion["date"],
+                "other_date": other_date,
+                "status": status,
+                "summary": summary,
+                "judge": judge,
+                "name": titlecase(name),
+                "docket": docket,
+            }
+
+            # Append a "V" as seen in the opinions PDF for the vaccine
+            # claims. This will help disambiguation, in case docket
+            # numbers collide
+            if self.is_vaccine:
+                if not docket.lower().endswith("v"):
+                    yy, number = docket.split("-")
+                    parsed_case["docket"] = f"{yy}-{number.zfill(4)}V"
+
+            self.cases.append(parsed_case)
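
For reference, the core trick in the rewritten scraper is that the new court site embeds its opinion list as a JavaScript constant inside a <script> tag, so _process_html slices off the assignment and hands the rest to json.loads. Below is a minimal, self-contained sketch of that technique; the page snippet, field values, and judge code are invented for illustration:

import json

from lxml import html

# A miniature, invented page in the shape the scraper expects: a <select>
# of judges and a <script> tag assigning a JSON array to a JS constant.
PAGE = """
<html><body>
<select name="judge">
  <option value="all">all</option>
  <option value="JS">Jane Smith</option>
</select>
<script>
const opinions = [{"title": "24-123 &bull; DOE v. UNITED STATES",
                   "link": "/opinions/1.pdf", "date": "2024-10-23",
                   "judge": "JS", "criteria": "reported",
                   "text": "Order dismissing the complaint."}];
</script>
</body></html>
"""

tree = html.fromstring(PAGE)

# Map judge codes to full names, as _process_html does, dropping sentinels.
judges_mapper = {
    opt.get("value"): opt.text_content()
    for opt in tree.xpath("//select[@name='judge']//option")
}
judges_mapper.pop("all", "")

# Pull the JS constant out of the first <script>: strip the trailing
# semicolon, cut everything up to the "= " assignment, parse as JSON.
raw = tree.xpath("//script")[0].text_content().strip().strip("; ")
for opinion in json.loads(raw.split("= ", 1)[1]):
    docket, name = opinion["title"].split(" &bull; ", 1)
    print(docket, name, judges_mapper.get(opinion["judge"], ""))
    # 24-123 DOE v. UNITED STATES Jane Smith

The same slicing (strip("; ") plus split("= ", 1)) is exactly what the scraper applies to the real page.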
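
The judge and "originally filed" values are likewise recovered from the summary string with the two class-level regexes, and vaccine dockets get zero-padded and suffixed with "V". A small demo against an invented summary string:

import re

judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")
other_date_regex = re.compile(r"\([Oo]riginally filed:?[\d\s/]+\)")

# Invented text shaped like the site's "text" field.
summary = (
    "Decision denying compensation (Originally filed: 10/01/2024) on "
    "remand. Signed by Chief Special Master Jane Doe. (jd) Service on "
    "parties made."
)

judge = ""
if judge_match := judge_regex.search(summary):
    judge = judge_match.group("judge").strip(" .()")
    # Drop the trailing "Signed by ... Service on parties made" boilerplate.
    summary = summary[: judge_match.start()].strip(", .()")
print(judge)  # Jane Doe

other_date = ""
if other_date_match := other_date_regex.search(summary):
    other_date = other_date_match.group(0).strip("() ")
    summary = other_date_regex.sub("", summary)
print(other_date)  # Originally filed: 10/01/2024

# Vaccine dockets are zero-padded and suffixed with "V" so they cannot
# collide with ordinary Court of Federal Claims docket numbers.
yy, number = "21-123".split("-")
print(f"{yy}-{number.zfill(4)}V")  # 21-0123V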
22 changes: 0 additions & 22 deletions juriscraper/opinions/united_states/federal_special/uscfc_u.py

This file was deleted.

juriscraper/opinions/united_states/federal_special/uscfc_vaccine.py
@@ -8,12 +8,18 @@
 class Site(uscfc.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/7"
         self.court_id = self.__module__
-        self.back_scrape_iterable = [1]
+        self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentDecisionsOfTheSpecialMasters.pl"
 
-    def _download_backwards(self, page):
-        self.url = (
-            f"http://www.uscfc.uscourts.gov/aggregator/sources/7?page={page}"
-        )
-        self.html = self._download()
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract 'status' from text, if possible
+
+        On the first page of the opinion, after the parties' and attorneys'
+        names, the decision title may point to it being published.
+        The scraped site itself marks all `uscfc_vaccine` opinions as
+        unreported
+        """
+        if "PUBLISHED DECISION" in scraped_text[:1500]:
+            return {"OpinionCluster": {"precedential_status": "Published"}}
+
+        return {}
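
A quick usage sketch for the hook above. extract_from_text is meant to be called by downstream consumers (CourtListener) on text extracted from the downloaded opinion, but it can be exercised directly; the opinion text below is invented:

from juriscraper.opinions.united_states.federal_special import uscfc_vaccine

site = uscfc_vaccine.Site()

published = (
    "IN THE UNITED STATES COURT OF FEDERAL CLAIMS\n"
    "OFFICE OF SPECIAL MASTERS\n\n"
    "PUBLISHED DECISION AWARDING COMPENSATION\n"
)
print(site.extract_from_text(published))
# {'OpinionCluster': {'precedential_status': 'Published'}}

# Anything without "PUBLISHED DECISION" in the first 1500 characters
# leaves the scraped status (unreported) untouched.
print(site.extract_from_text("DECISION DENYING COMPENSATION\n"))
# {}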

juriscraper/opinions/united_states/federal_special/uscfc_vaccine_u.py

This file was deleted.

