fix(us_backscrapers.mass): fix extract from text
Solves #1234

fixed indexing errors in extract_from_text
grossir committed Nov 12, 2024
1 parent 0644f6f commit e2277d1
Showing 2 changed files with 18 additions and 22 deletions.
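The "indexing errors" the commit message refers to come from how lxml's xpath() behaves: it returns a plain Python list of matching elements, so the element has to be pulled out of the list before string() can be applied to it; calling .xpath() on the list itself raises AttributeError. A minimal sketch of the pitfall and the corrected order (the HTML snippet and variable names are illustrative, not taken from the scraper):

from lxml.html import fromstring

# Hypothetical headnote markup, used only to illustrate the lookup.
html = fromstring(
    "<div><section class='headnote'><p>No. 1234. Some headnote text.</p></section></div>"
)
sections = html.xpath("//section[@class='headnote']")  # a plain Python list of elements

# Buggy order: a list has no .xpath(), so this raises AttributeError.
# sections.xpath("string()")[0]

# Fixed order: index the element out of the list, then take its text content.
text = sections[0].xpath("string()")
print(text)  # -> No. 1234. Some headnote text.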
36 changes: 18 additions & 18 deletions juriscraper/opinions/united_states_backscrapers/state/mass.py
@@ -6,7 +6,7 @@
 from dateutil import parser
 from lxml.html import fromstring
 
-from juriscraper.lib.string_utils import clean_string
+from juriscraper.lib.string_utils import clean_string, titlecase
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
@@ -41,9 +41,6 @@ class Site(OpinionSiteLinear):
             "url": "http://masscases.com/275-299.html",
         },
     ]
-    # on the cl_scrape_opinions command in Courtlistener,
-    # a headers variable is required for mass
-    headers = {}
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -78,7 +75,7 @@ def _process_html(self) -> None:
                 {
                     "citation": cite,
                     "date": date_filed_str,
-                    "name": name,
+                    "name": titlecase(name),
                     "url": url,
                     "docket": "",
                     "status": "Published",
@@ -96,32 +93,35 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         The format on App Ct opinions is different
         """
-        match = re.search(self.docket_number_regex, scraped_text[:2000])
-        docket = match.group(0) if match else ""
+        metadata = {"OpinionCluster": {}}
+        if match := re.search(self.docket_number_regex, scraped_text[:2000]):
+            docket = match.group(0)
+            metadata["Docket"] = {"docket_number": docket}
 
-        headnotes, summary = "", ""
         html = fromstring(scraped_text)
         headnote_section = html.xpath("//section[@class='headnote']")
         synopsis_section = html.xpath("//section[@class='synopsis']")
 
         if headnote_section:
+            headnotes = clean_string(headnote_section[0].xpath("string()"))
             # First line of the headnote might be the docket number
-            headnotes = clean_string(
-                headnote_section.xpath("string()")[0].replace(docket, "")
-            )
+            if metadata.get("Docket"):
+                headnotes = (
+                    headnotes.replace(docket, "").replace("No. . ", "").strip()
+                )
+            metadata["OpinionCluster"]["headnotes"] = headnotes
 
         if synopsis_section:
             summary = "\n".join(
                 [
                     clean_string(p.xpath("string()"))
                     # avoid page numbers
-                    for p in synopsis_section.xpath(".//p[not(@class)]")
+                    for p in synopsis_section[0].xpath(".//p[not(@class)]")
                 ]
             )
+            metadata["OpinionCluster"]["summary"] = summary
 
-        return {
-            "Docket": {"docket_number": docket},
-            "OpinionCluster": {"headnotes": headnotes, "summary": summary},
-        }
+        return metadata
 
     def _download_backwards(
         self, dates_and_url: Tuple[date, date, str]
@@ -148,11 +148,11 @@ def make_backscrape_iterable(self, kwargs: dict) -> None:
         now = datetime.now()
 
         if start:
-            start = datetime.strptime(start, "%m/%d/%Y")
+            start = datetime.strptime(start, "%Y/%m/%d")
         else:
            start = self.first_opinion_date
         if end:
-            end = datetime.strptime(end, "%m/%d/%Y")
+            end = datetime.strptime(end, "%Y/%m/%d")
         else:
             end = now
 
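The rewritten extract_from_text now assembles its return value incrementally, so the "Docket" key is only present when the docket-number regex actually matched, instead of always returning an empty docket string. A rough sketch of the resulting shape, using a stand-in pattern and text rather than the scraper's real regex and opinion HTML:

import re

# Stand-ins for illustration only; the real regex lives on the Site class.
docket_number_regex = r"No\. \d+"
scraped_text = "No. 4321. COMMONWEALTH vs. SOMEBODY. Suffolk County."

metadata = {"OpinionCluster": {}}
if match := re.search(docket_number_regex, scraped_text[:2000]):
    metadata["Docket"] = {"docket_number": match.group(0)}

print(metadata)
# {'OpinionCluster': {}, 'Docket': {'docket_number': 'No. 4321'}}

When no docket number is found, only the "OpinionCluster" entry comes back, which matches the conditional headnote cleanup guarded by metadata.get("Docket") in the diff above.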
4 changes: 0 additions & 4 deletions
@@ -17,7 +17,3 @@ class Site(mass.Site):
             "url": "http://masscases.com/app75-99.html",
         },
     ]
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.court_id = self.__module__
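The second changed file only loses its __init__ override. That removal is safe because self.__module__ resolves through the instance's class, so the parent's __init__ already records the subclass's own module path as court_id (assuming the parent sets court_id the same way the deleted lines did, which the diff does not show). A small sketch of that behavior, with made-up class names:

# Illustrative classes, not from the repository.
class Base:
    def __init__(self):
        # self.__module__ is looked up on type(self), i.e. the subclass.
        self.court_id = self.__module__


class Child(Base):
    pass


# Both classes live in this one script, so this prints "__main__"; if Child
# were defined in its own module, court_id would be that module's dotted path.
print(Child().court_id)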
