From f33559f5ffbc3ec1b9eec579056866bc508e71c6 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 2 Mar 2023 17:31:02 -0600 Subject: [PATCH] fix(pacer): Added support for parsing district/bankruptcy download confirmation pages --- .../pacer/download_confirmation_page.py | 65 ++++++++++++++----- .../pacer/confirmation_pages/caeb_1.html | 60 +++++++++++++++++ .../pacer/confirmation_pages/caeb_1.json | 8 +++ .../pacer/confirmation_pages/cand_1.html | 48 ++++++++++++++ .../pacer/confirmation_pages/cand_1.json | 8 +++ .../pacer/confirmation_pages/ctd_1.html | 48 ++++++++++++++ .../pacer/confirmation_pages/ctd_1.json | 8 +++ .../pacer/confirmation_pages/txwd_1.html | 48 ++++++++++++++ .../pacer/confirmation_pages/txwd_1.json | 1 + .../pacer/confirmation_pages/vaed_1.html | 48 ++++++++++++++ .../pacer/confirmation_pages/vaed_1.json | 8 +++ tests/network/test_PacerFreeOpinionsTest.py | 28 ++++++++ 12 files changed, 363 insertions(+), 15 deletions(-) create mode 100644 tests/examples/pacer/confirmation_pages/caeb_1.html create mode 100644 tests/examples/pacer/confirmation_pages/caeb_1.json create mode 100644 tests/examples/pacer/confirmation_pages/cand_1.html create mode 100644 tests/examples/pacer/confirmation_pages/cand_1.json create mode 100644 tests/examples/pacer/confirmation_pages/ctd_1.html create mode 100644 tests/examples/pacer/confirmation_pages/ctd_1.json create mode 100644 tests/examples/pacer/confirmation_pages/txwd_1.html create mode 100644 tests/examples/pacer/confirmation_pages/txwd_1.json create mode 100644 tests/examples/pacer/confirmation_pages/vaed_1.html create mode 100644 tests/examples/pacer/confirmation_pages/vaed_1.json diff --git a/juriscraper/pacer/download_confirmation_page.py b/juriscraper/pacer/download_confirmation_page.py index e298df912..d012e26a8 100644 --- a/juriscraper/pacer/download_confirmation_page.py +++ b/juriscraper/pacer/download_confirmation_page.py @@ -4,7 +4,7 @@ from ..lib.log_tools import make_default_logger from ..lib.string_utils import clean_string, convert_date_string, force_unicode from .reports import BaseReport -from .utils import is_pdf, make_docs1_url +from .utils import is_pdf, make_doc1_url, make_docs1_url logger = make_default_logger() @@ -17,6 +17,14 @@ class DownloadConfirmationPage(BaseReport): def __init__(self, court_id, pacer_session=None): super().__init__(court_id, pacer_session) + self.is_appellate = False + if self.court_id[-1].isdigit() or self.court_id in [ + "cadc", + "cafc", + "cavc", + ]: + self.is_appellate = True + def query(self, pacer_doc_id): """Query the "confirmation download page" endpoint and set the results to self.response. @@ -29,8 +37,12 @@ def query(self, pacer_doc_id): self.session is not None ), "session attribute of DownloadConfirmationPage cannot be None." - # Make the NDA document URL - url = make_docs1_url(self.court_id, pacer_doc_id, True) + if self.is_appellate: + # Make the appellate document URL + url = make_docs1_url(self.court_id, pacer_doc_id, True) + else: + # Make the district/bankruptcy document URL + url = make_doc1_url(self.court_id, pacer_doc_id, True) logger.info("Querying the confirmation page endpoint at URL: %s", url) self.response = self.session.get(url) @@ -59,14 +71,13 @@ def data(self): if self.is_valid is False: return {} - document_number = self._get_document_number() - if document_number is None: + if not self._is_a_receipt_page(): # Abort. If we cannot get a document number return a empy dict. # It's not a valid confirmation page. return {} return { - "document_number": document_number, + "document_number": self._get_document_number(), "docket_number": self._get_docket_number(), "cost": self._get_document_cost(), "billable_pages": self._get_billable_pages(), @@ -74,6 +85,21 @@ def data(self): "transaction_date": self._get_transaction_date(), } + def _is_a_receipt_page(self) -> bool: + """Check if this is a valid download confirmation page for a district + bankruptcy or appellate court. + + :return: True if is a valid page, otherwise False. + """ + + try: + transaction_str = self.tree.re_xpath( + '//*[re:match(text(), "Transaction Receipt")]' + )[0] + except IndexError: + return False + return True + def _get_document_number(self) -> Optional[str]: """Get the document number for an item. @@ -107,7 +133,7 @@ def _get_document_cost(self) -> Optional[str]: return None if cost_str: - return cost_str + return clean_string(cost_str) return None def _get_docket_number(self) -> Optional[str]: @@ -117,16 +143,25 @@ def _get_docket_number(self) -> Optional[str]: """ try: - document_and_case_number = self.tree.xpath( - '//strong[contains(., "Document: PDF Document")]' - )[0].text_content() + if self.is_appellate: + document_and_case_number = self.tree.xpath( + '//strong[contains(., "Document: PDF Document")]' + )[0].text_content() + else: + docket_number = self.tree.re_xpath( + '//*[re:match(text(), "Case Number:")]/' + "/ancestor::th[1]/following-sibling::td[1]/font[1]" + )[0].text_content() + except IndexError: return None - regex = r"Case:([^\,]*)" - docket_number = re.findall(regex, document_and_case_number) + if self.is_appellate: + regex = r"Case:([^\,]*)" + docket_number = re.findall(regex, document_and_case_number)[0] + if docket_number: - return clean_string(docket_number[0]) + return clean_string(docket_number) return None def _get_billable_pages(self) -> Optional[str]: @@ -143,7 +178,7 @@ def _get_billable_pages(self) -> Optional[str]: return None if billable_pages_str: - return billable_pages_str + return clean_string(billable_pages_str) return None def _get_document_description(self) -> Optional[str]: @@ -160,7 +195,7 @@ def _get_document_description(self) -> Optional[str]: return None if document_description_str: - return document_description_str + return clean_string(document_description_str) return None def _get_transaction_date(self) -> Optional[str]: diff --git a/tests/examples/pacer/confirmation_pages/caeb_1.html b/tests/examples/pacer/confirmation_pages/caeb_1.html new file mode 100644 index 000000000..be5c357e2 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/caeb_1.html @@ -0,0 +1,60 @@ + + +CM/ECF LIVE - U.S. Bankruptcy Court:caeb + +
+
+ +

To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.


PACER Service Center
Transaction Receipt
Thu Mar 2 14:10:07 2023
Pacer Login: jesus13law Client Code:
Description: Image:64-0 Case Number: 20-10691
Billable Pages: 5 Cost: 0.50
\ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/caeb_1.json b/tests/examples/pacer/confirmation_pages/caeb_1.json new file mode 100644 index 000000000..e3b38cd89 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/caeb_1.json @@ -0,0 +1,8 @@ +{ + "billable_pages": "5", + "cost": "0.50", + "docket_number": "20-10691", + "document_description": "Image:64-0", + "document_number": null, + "transaction_date": "2023-03-02T14:10:07" +} \ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/cand_1.html b/tests/examples/pacer/confirmation_pages/cand_1.html new file mode 100644 index 000000000..f1e8e2ae2 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/cand_1.html @@ -0,0 +1,48 @@ + + +CAND-ECF +
+
+

To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.


PACER Service Center
Transaction Receipt
Thu Mar 2 13:53:33 2023
Pacer Login: jesus13law Client Code:
Description: Image670-0 Case Number: 3:18-cv-04865-EMC
Billable Pages: 1 Cost: 0.10
\ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/cand_1.json b/tests/examples/pacer/confirmation_pages/cand_1.json new file mode 100644 index 000000000..74cbe56ad --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/cand_1.json @@ -0,0 +1,8 @@ +{ + "billable_pages": "1", + "cost": "0.10", + "docket_number": "3:18-cv-04865-EMC", + "document_description": "Image670-0", + "document_number": null, + "transaction_date": "2023-03-02T13:53:33" +} \ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/ctd_1.html b/tests/examples/pacer/confirmation_pages/ctd_1.html new file mode 100644 index 000000000..33312f8a4 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/ctd_1.html @@ -0,0 +1,48 @@ + + +CT CMECF NextGen +
+
+

To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.


PACER Service Center
Transaction Receipt
Thu Mar 2 17:06:29 2023
Pacer Login: jesus13law Client Code:
Description: Image462-0 Case Number: 3:16-cv-01702-JAM
Billable Pages: 3 Cost: 0.30
\ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/ctd_1.json b/tests/examples/pacer/confirmation_pages/ctd_1.json new file mode 100644 index 000000000..0c514da76 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/ctd_1.json @@ -0,0 +1,8 @@ +{ + "billable_pages": "3", + "cost": "0.30", + "docket_number": "3:16-cv-01702-JAM", + "document_description": "Image462-0", + "document_number": null, + "transaction_date": "2023-03-02T17:06:29" +} \ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/txwd_1.html b/tests/examples/pacer/confirmation_pages/txwd_1.html new file mode 100644 index 000000000..f34c12198 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/txwd_1.html @@ -0,0 +1,48 @@ + + +Centralized CM/ECF LIVE - U.S. District Court:txwd +
+
+You do not have permission to view this document.
\ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/txwd_1.json b/tests/examples/pacer/confirmation_pages/txwd_1.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/txwd_1.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/vaed_1.html b/tests/examples/pacer/confirmation_pages/vaed_1.html new file mode 100644 index 000000000..d4f412f18 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/vaed_1.html @@ -0,0 +1,48 @@ + + +CM/ECF - vaed +
+
+

To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.


PACER Service Center
Transaction Receipt
Thu Mar 2 17:07:51 2023
Pacer Login: jesus13law Client Code:
Description: Image437-0 Case Number: 2:18-cv-00530-MSD-RJK
Billable Pages: 2 Cost: 0.20
\ No newline at end of file diff --git a/tests/examples/pacer/confirmation_pages/vaed_1.json b/tests/examples/pacer/confirmation_pages/vaed_1.json new file mode 100644 index 000000000..013de5d93 --- /dev/null +++ b/tests/examples/pacer/confirmation_pages/vaed_1.json @@ -0,0 +1,8 @@ +{ + "billable_pages": "2", + "cost": "0.20", + "docket_number": "2:18-cv-00530-MSD-RJK", + "document_description": "Image437-0", + "document_number": null, + "transaction_date": "2023-03-02T17:07:51" +} \ No newline at end of file diff --git a/tests/network/test_PacerFreeOpinionsTest.py b/tests/network/test_PacerFreeOpinionsTest.py index 86d1f5797..d5d6f0224 100644 --- a/tests/network/test_PacerFreeOpinionsTest.py +++ b/tests/network/test_PacerFreeOpinionsTest.py @@ -271,10 +271,16 @@ def setUp(self): self.report = DownloadConfirmationPage("ca8", self.session) self.report_att = DownloadConfirmationPage("ca5", self.session) self.report_pdf = DownloadConfirmationPage("ca11", self.session) + self.report_nef_no_confirmation = DownloadConfirmationPage( + "txwd", self.session + ) + self.report_nef = DownloadConfirmationPage("cand", self.session) self.pacer_doc_id = "00812590792" self.no_confirmation_page_pacer_doc_id = "00802251695" self.pacer_doc_id_att = "00506470276" self.pacer_doc_id_pdf = "011012534985" + self.pacer_doc_id_nef_no_confirmation = "181027895860" + self.pacer_doc_id_nef = "035022812318" @SKIP_IF_NO_PACER_LOGIN def test_get_document_number(self): @@ -315,3 +321,25 @@ def test_no_confirmation_page_pdf_returned(self): self.report_pdf.query(self.pacer_doc_id_pdf) data_report = self.report_pdf.data self.assertEqual(data_report, {}) + + @SKIP_IF_NO_PACER_LOGIN + def test_confirmation_page_pdf_district(self): + """Can we get the PACER document number from a district download + confirmation page?""" + self.report_nef.query(self.pacer_doc_id_nef) + data_report = self.report_nef.data + self.assertEqual(data_report["document_number"], None) + self.assertEqual(data_report["docket_number"], "3:18-cv-04865-EMC") + self.assertEqual(data_report["cost"], "0.10") + self.assertEqual(data_report["billable_pages"], "1") + self.assertEqual(data_report["document_description"], "Image670-0") + + @SKIP_IF_NO_PACER_LOGIN + def test_no_confirmation_page_pdf_returned_district(self): + """If the district download confirmation page is not available an empty + dictionary is returned""" + self.report_nef_no_confirmation.query( + self.pacer_doc_id_nef_no_confirmation + ) + data_report = self.report_nef_no_confirmation.data + self.assertEqual(data_report, {})