
Commit

Merge pull request #1224 from freelawproject/uscfc_new_site
fix(uscfc): implement new site
flooie authored Nov 7, 2024
2 parents ed44c1e + a044dde commit 22226ee
Showing 18 changed files with 74,635 additions and 5,397 deletions.
@@ -11,9 +11,7 @@
"nmcca",
"tax",
"uscfc",
"uscfc_u",
"uscfc_vaccine",
"uscfc_vaccine_u",
"fisc",
"fiscr",
]
185 changes: 67 additions & 118 deletions juriscraper/opinions/united_states/federal_special/uscfc.py
@@ -4,137 +4,86 @@
 Notes:
     Scraper adapted for new website as of February 20, 2014.
+    2024-10-23, grossir: implemented new site
 """
 
-import datetime
+import json
 import re
 
-from lxml import html
-
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.lib.string_utils import (
-    clean_if_py3,
-    convert_date_string,
-    titlecase,
-)
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.lib.string_utils import titlecase
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
+    judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")
+    other_date_regex = re.compile(r"\([Oo]riginally filed:?[\d\s/]+\)")
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/8"
-        self.back_scrape_iterable = list(range(1, 4))
+        self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentOpinionsOfTheCourt.pl"
         self.court_id = self.__module__
-        self.today = datetime.datetime.now()
-
-    def _download(self, request_dict={}):
-        if self.test_mode_enabled():
-            # Use static 'today' date for consistent test results
-            self.today = convert_date_string("2018/10/17")
-        return super()._download(request_dict)
-
-    def _get_case_dates(self):
-        dates = []
-        for item in self.html.xpath('//span[@class="feed-item-date"]'):
-            text = item.text_content().strip()
-            words = text.split()
-            if len(words) == 2:
-                date = convert_date_string(words[1])
-            elif "ago" in text:
-                # The record was added today "X hours and Y min ago"
-                date = self.today
-            else:
-                raise InsanityException(
-                    f"Unrecognized date element string: {text}"
-                )
-            dates.append(date)
-        return dates
-
-    def _get_case_names(self):
-        case_names = []
-        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
-            t = " ".join(clean_if_py3(t).split())  # Normalize whitespace
-            if t.strip():
-                # If there is something other than whitespace...
-                if not isinstance(t, str):
-                    t = str(t, encoding="utf-8")
-
-                if " • " in t:
-                    t = t.split(" • ")[1].strip()
-                t = titlecase(t.lower())
-                case_names.append(t)
-        return case_names
-
-    def _get_download_urls(self):
-        path = '//h3[@class="feed-item-title"]/a/@href'
-        return list(self.html.xpath(path))
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
-
-    def _get_docket_numbers(self):
-        docket_numbers = []
-        for t in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
-            t = clean_if_py3(t)
-            if t.strip():
-                # If there is something other than whitespace...
-                if not isinstance(t, str):
-                    t = str(t, encoding="utf-8")
-
-                if " • " in t:
-                    t = t.split(" • ")[0].strip()
-                docket_numbers.append(t)
-        return docket_numbers
-
-    def _get_summaries(self):
-        summaries = []
-        path = '//div[@class="feed-item-body"]'
-        for e in self.html.xpath(path):
-            s = html.tostring(e, method="text", encoding="unicode")
-            s = clean_if_py3(s).split("Keywords:")[0]
-            summaries.append(s)
-
-        return summaries
-
-    def _get_judges(self):
-        path = '//div[@class="feed-item-body"]'
-        judges = []
-        splitters = [
-            "Signed by Chief Judge",
-            "Signed by Judge",
-            "Signed by Chief Special Master",  # Vaccine courts have odd names for judges
-            "Signed by Special Master",
-        ]
-        for e in self.html.xpath(path):
-            t = html.tostring(e, method="text", encoding="unicode")
-            t = clean_if_py3(t).split("Keywords:")[0]
-            for splitter in splitters:
-                judge_parts = t.rsplit(splitter)
-                if len(judge_parts) == 1:
-                    # No splits found...
-                    judge = ""
-                    continue
-                else:
-                    judge = judge_parts[1]
-                    break
-
-            # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to parties.'
-            # In that case we only want the name, not the rest.
-            length_of_match = 2
-            m = re.search(
-                r"[a-z]{%s}\." % length_of_match, judge
-            )  # Two lower case letters followed by a period
-            if m:
-                judge = judge[: m.start() + length_of_match]
-            else:
-                judge = ""
-            judge.strip(".")
-            judges.append(judge)
-        return judges
-
-    def _download_backwards(self, page):
-        self.url = (
-            f"http://www.uscfc.uscourts.gov/aggregator/sources/8?page={page}"
-        )
-        self.html = self._download()
+        self.is_vaccine = "uscfc_vaccine" in self.court_id
+
+    def _process_html(self):
+        """The site returns a page with all opinions for this time period
+        The opinions are inside a <script> tag, as a Javascript constant
+        that will be parsed using json.loads
+        """
+        judges_mapper = {
+            option.get("value"): option.text_content()
+            for option in self.html.xpath("//select[@name='judge']//option")
+        }
+        judges_mapper.pop("UNKNOWN", "")
+        judges_mapper.pop("all", "")
+
+        raw_data = (
+            self.html.xpath("//script")[0]
+            .text_content()
+            .strip()
+            .strip("; ")
+            .split("= ", 1)[1]
+        )
+
+        for opinion in json.loads(raw_data):
+            docket, name = opinion["title"].split(" &bull; ", 1)
+
+            summary = opinion["text"]
+            if judge_match := self.judge_regex.search(summary):
+                judge = judge_match.group("judge").strip(" .()")
+                # Remove: "Signed by ... . Service on parties made"
+                summary = summary[: judge_match.start()].strip(", .()")
+            else:
+                judge = judges_mapper.get(opinion["judge"], "")
+
+            other_date = ""
+            if other_date_match := self.other_date_regex.search(summary):
+                other_date = other_date_match.group(0).strip("() ")
+                summary = re.sub(self.other_date_regex, "", summary)
+
+            if opinion["criteria"] == "unreported":
+                status = "Unpublished"
+            elif opinion["criteria"] == "reported":
+                status = "Published"
+            else:
+                status = "Unknown"
+
+            parsed_case = {
+                "url": opinion["link"],
+                "date": opinion["date"],
+                "other_date": other_date,
+                "status": status,
+                "summary": summary,
+                "judge": judge,
+                "name": titlecase(name),
+                "docket": docket,
+            }
+
+            # Append a "V" as seen in the opinions PDF for the vaccine
+            # claims. This will help disambiguation, in case docket
+            # numbers collide
+            if self.is_vaccine:
+                if not docket.lower().endswith("v"):
+                    yy, number = docket.split("-")
+                    parsed_case["docket"] = f"{yy}-{number.zfill(4)}V"
+
+            self.cases.append(parsed_case)
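
For reference, the core trick in the rewritten scraper is that the new court site embeds its opinion list as a JavaScript constant inside a <script> tag, so _process_html slices off the assignment and hands the rest to json.loads. Below is a minimal, self-contained sketch of that technique; the page snippet, field values, and judge code are invented for illustration:

import json

from lxml import html

# A miniature, invented page in the shape the scraper expects: a <select>
# of judges and a <script> tag assigning a JSON array to a JS constant.
PAGE = """
<html><body>
<select name="judge">
  <option value="all">all</option>
  <option value="JS">Jane Smith</option>
</select>
<script>
const opinions = [{"title": "24-123 &bull; DOE v. UNITED STATES",
                   "link": "/opinions/1.pdf", "date": "2024-10-23",
                   "judge": "JS", "criteria": "reported",
                   "text": "Order dismissing the complaint."}];
</script>
</body></html>
"""

tree = html.fromstring(PAGE)

# Map judge codes to full names, as _process_html does, dropping sentinels.
judges_mapper = {
    opt.get("value"): opt.text_content()
    for opt in tree.xpath("//select[@name='judge']//option")
}
judges_mapper.pop("all", "")

# Pull the JS constant out of the first <script>: strip the trailing
# semicolon, cut everything up to the "= " assignment, parse as JSON.
raw = tree.xpath("//script")[0].text_content().strip().strip("; ")
for opinion in json.loads(raw.split("= ", 1)[1]):
    docket, name = opinion["title"].split(" &bull; ", 1)
    print(docket, name, judges_mapper.get(opinion["judge"], ""))
    # 24-123 DOE v. UNITED STATES Jane Smith

The same slicing (strip("; ") plus split("= ", 1)) is exactly what the scraper applies to the real page.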
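
The judge and "originally filed" values are likewise recovered from the summary string with the two class-level regexes, and vaccine dockets get zero-padded and suffixed with "V". A small demo against an invented summary string:

import re

judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")
other_date_regex = re.compile(r"\([Oo]riginally filed:?[\d\s/]+\)")

# Invented text shaped like the site's "text" field.
summary = (
    "Decision denying compensation (Originally filed: 10/01/2024) on "
    "remand. Signed by Chief Special Master Jane Doe. (jd) Service on "
    "parties made."
)

judge = ""
if judge_match := judge_regex.search(summary):
    judge = judge_match.group("judge").strip(" .()")
    # Drop the trailing "Signed by ... Service on parties made" boilerplate.
    summary = summary[: judge_match.start()].strip(", .()")
print(judge)  # Jane Doe

other_date = ""
if other_date_match := other_date_regex.search(summary):
    other_date = other_date_match.group(0).strip("() ")
    summary = other_date_regex.sub("", summary)
print(other_date)  # Originally filed: 10/01/2024

# Vaccine dockets are zero-padded and suffixed with "V" so they cannot
# collide with ordinary Court of Federal Claims docket numbers.
yy, number = "21-123".split("-")
print(f"{yy}-{number.zfill(4)}V")  # 21-0123V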
22 changes: 0 additions & 22 deletions juriscraper/opinions/united_states/federal_special/uscfc_u.py

This file was deleted.

juriscraper/opinions/united_states/federal_special/uscfc_vaccine.py
@@ -8,12 +8,18 @@
 class Site(uscfc.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/7"
         self.court_id = self.__module__
-        self.back_scrape_iterable = [1]
+        self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentDecisionsOfTheSpecialMasters.pl"
 
-    def _download_backwards(self, page):
-        self.url = (
-            f"http://www.uscfc.uscourts.gov/aggregator/sources/7?page={page}"
-        )
-        self.html = self._download()
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract 'status' from text, if possible
+
+        On the first page of the opinion, after the parties' and attorneys'
+        names, the decision title may point to it being published.
+        The scraped site itself marks all `uscfc_vaccine` opinions as
+        unreported
+        """
+        if "PUBLISHED DECISION" in scraped_text[:1500]:
+            return {"OpinionCluster": {"precedential_status": "Published"}}
+
+        return {}
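
A quick usage sketch for the hook above. extract_from_text is meant to be called by downstream consumers (CourtListener) on text extracted from the downloaded opinion, but it can be exercised directly; the opinion text below is invented:

from juriscraper.opinions.united_states.federal_special import uscfc_vaccine

site = uscfc_vaccine.Site()

published = (
    "IN THE UNITED STATES COURT OF FEDERAL CLAIMS\n"
    "OFFICE OF SPECIAL MASTERS\n\n"
    "PUBLISHED DECISION AWARDING COMPENSATION\n"
)
print(site.extract_from_text(published))
# {'OpinionCluster': {'precedential_status': 'Published'}}

# Anything without "PUBLISHED DECISION" in the first 1500 characters
# leaves the scraped status (unreported) untouched.
print(site.extract_from_text("DECISION DENYING COMPENSATION\n"))
# {}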

juriscraper/opinions/united_states/federal_special/uscfc_vaccine_u.py

This file was deleted.

