Skip to content

Commit

Permalink
fix(pacer): Added support for parsing district/bankruptcy download co…
Browse files Browse the repository at this point in the history
…nfirmation pages
  • Loading branch information
albertisfu committed Mar 2, 2023
1 parent 9a4328e commit f33559f
Show file tree
Hide file tree
Showing 12 changed files with 363 additions and 15 deletions.
65 changes: 50 additions & 15 deletions juriscraper/pacer/download_confirmation_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ..lib.log_tools import make_default_logger
from ..lib.string_utils import clean_string, convert_date_string, force_unicode
from .reports import BaseReport
from .utils import is_pdf, make_docs1_url
from .utils import is_pdf, make_doc1_url, make_docs1_url

logger = make_default_logger()

Expand All @@ -17,6 +17,14 @@ class DownloadConfirmationPage(BaseReport):
def __init__(self, court_id, pacer_session=None):
    """Set up the report and record whether the court is appellate.

    :param court_id: The court identifier, e.g. "cand".
    :param pacer_session: Optional session forwarded to the parent class.
    """
    super().__init__(court_id, pacer_session)

    # Identifiers ending in a digit, plus the three listed here, are
    # treated as appellate courts.
    appellate_ids_without_digit = ["cadc", "cafc", "cavc"]
    ends_in_digit = self.court_id[-1].isdigit()
    self.is_appellate = (
        ends_in_digit or self.court_id in appellate_ids_without_digit
    )

def query(self, pacer_doc_id):
"""Query the "confirmation download page" endpoint and set the results
to self.response.
Expand All @@ -29,8 +37,12 @@ def query(self, pacer_doc_id):
self.session is not None
), "session attribute of DownloadConfirmationPage cannot be None."

# Make the NDA document URL
url = make_docs1_url(self.court_id, pacer_doc_id, True)
if self.is_appellate:
# Make the appellate document URL
url = make_docs1_url(self.court_id, pacer_doc_id, True)
else:
# Make the district/bankruptcy document URL
url = make_doc1_url(self.court_id, pacer_doc_id, True)

logger.info("Querying the confirmation page endpoint at URL: %s", url)
self.response = self.session.get(url)
Expand Down Expand Up @@ -59,21 +71,35 @@ def data(self):
if self.is_valid is False:
return {}

document_number = self._get_document_number()
if document_number is None:
if not self._is_a_receipt_page():
# Abort. If we cannot get a document number return an empty dict.
# It's not a valid confirmation page.
return {}

return {
"document_number": document_number,
"document_number": self._get_document_number(),
"docket_number": self._get_docket_number(),
"cost": self._get_document_cost(),
"billable_pages": self._get_billable_pages(),
"document_description": self._get_document_description(),
"transaction_date": self._get_transaction_date(),
}

def _is_a_receipt_page(self) -> bool:
"""Check if this is a valid download confirmation page for a district
bankruptcy or appellate court.
:return: True if is a valid page, otherwise False.
"""

try:
transaction_str = self.tree.re_xpath(
'//*[re:match(text(), "Transaction Receipt")]'
)[0]
except IndexError:
return False
return True

def _get_document_number(self) -> Optional[str]:
"""Get the document number for an item.
Expand Down Expand Up @@ -107,7 +133,7 @@ def _get_document_cost(self) -> Optional[str]:
return None

if cost_str:
return cost_str
return clean_string(cost_str)
return None

def _get_docket_number(self) -> Optional[str]:
Expand All @@ -117,16 +143,25 @@ def _get_docket_number(self) -> Optional[str]:
"""

try:
document_and_case_number = self.tree.xpath(
'//strong[contains(., "Document: PDF Document")]'
)[0].text_content()
if self.is_appellate:
document_and_case_number = self.tree.xpath(
'//strong[contains(., "Document: PDF Document")]'
)[0].text_content()
else:
docket_number = self.tree.re_xpath(
'//*[re:match(text(), "Case Number:")]/'
"/ancestor::th[1]/following-sibling::td[1]/font[1]"
)[0].text_content()

except IndexError:
return None

regex = r"Case:([^\,]*)"
docket_number = re.findall(regex, document_and_case_number)
if self.is_appellate:
regex = r"Case:([^\,]*)"
docket_number = re.findall(regex, document_and_case_number)[0]

if docket_number:
return clean_string(docket_number[0])
return clean_string(docket_number)
return None

def _get_billable_pages(self) -> Optional[str]:
Expand All @@ -143,7 +178,7 @@ def _get_billable_pages(self) -> Optional[str]:
return None

if billable_pages_str:
return billable_pages_str
return clean_string(billable_pages_str)
return None

def _get_document_description(self) -> Optional[str]:
Expand All @@ -160,7 +195,7 @@ def _get_document_description(self) -> Optional[str]:
return None

if document_description_str:
return document_description_str
return clean_string(document_description_str)
return None

def _get_transaction_date(self) -> Optional[str]:
Expand Down
60 changes: 60 additions & 0 deletions tests/examples/pacer/confirmation_pages/caeb_1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

<!-- saved from url=(0061)https://ecf.caeb.uscourts.gov/doc1/032031165612?caseid=640228 -->
<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><link rel="shortcut icon" href="https://ecf.caeb.uscourts.gov/favicon.ico"><title>CM/ECF LIVE - U.S. Bankruptcy Court:caeb</title>
<script type="text/javascript">document.cookie = "PRTYPE=web; path=/;"</script> <script>var default_base_path = "/"; </script> <link rel="stylesheet" type="text/css" href="./caeb_1_files/default.css"><script type="text/javascript" src="./caeb_1_files/core.js"></script><script type="text/javascript" src="./caeb_1_files/DisableAJTALinks.js"></script><script type="text/javascript">if (top!=self) {top.location.replace(location.href);}</script><script>var default_base_path = "/"; </script></head><body bgcolor="FAF0E6" text="000000" onload="SetFocus()">
<div class="noprint">
<div id="topmenu" class="yuimenubar"><div class="bd">
<img id="cmecfLogo" class="cmecfLogo" src="./caeb_1_files/logo-cmecf-sm.png" alt="CM/ECF" title="">
<ul class="first-of-type">

<li class="yuimenubaritem"><a class="yuimenubaritemlabel" href="https://ecf.caeb.uscourts.gov/cgi-bin/iquery.pl">Query</a></li>
<li class="yuimenubaritem"><a class="yuimenubaritemlabel" href="https://ecf.caeb.uscourts.gov/cgi-bin/DisplayMenu.pl?Reports&amp;id=-1">Reports <div class="spritedownarrow"></div></a></li>
<li class="yuimenubaritem"><a class="yuimenubaritemlabel" href="https://ecf.caeb.uscourts.gov/cgi-bin/DisplayMenu.pl?Utilities&amp;id=-1">Utilities <div class="spritedownarrow"></div></a></li>
<li class="yuimenubaritem"><a class="yuimenubaritemlabel" onclick="CMECF.MainMenu.showHelpPage(&#39;&#39;); return false">Help</a></li>
<li class="yuimenubaritem"><a class="yuimenubaritemlabel" href="https://ecf.caeb.uscourts.gov/cgi-bin/login.pl?logout">Log Out</a></li><li class="yuimenubaritem" id="placeholderForAlertsIcon"></li>
</ul></div>
<hr class="hrmenuseparator"></div></div>

<script type="text/javascript">
callCreateMenu=function(){
var fn = "CMECF.MainMenu.renderSimpleMenu";
if(typeof CMECF.MainMenu.renderSimpleMenu == 'function') {
CMECF.MainMenu.renderSimpleMenu();
}
}
if (navigator.appVersion.indexOf("MSIE")==-1){window.setTimeout( function(){ callCreateMenu(); }, 1);}else{CMECF.util.Event.addListener(window, "load", callCreateMenu());}</script> <div id="cmecfMainContent" style="height: 699px;"><input type="hidden" id="cmecfMainContentScroll" value="0"><script language="JavaScript">
var IsForm = false;
var FirstField;
function SetFocus() {
if(IsForm) {
if(FirstField) {
var ind = FirstField.indexOf('document.',0);
if(ind == 0)
{
eval(FirstField);
}
else
{
var Code = "document.forms[0]."+FirstField+".focus();";
eval(Code);
}
} else {
var Cnt = 0;
while(document.forms[0].elements[Cnt] != null) {
try {
if(document.forms[0].elements[Cnt].type != "hidden" &&
!document.forms[0].elements[Cnt].disabled &&
!document.forms[0].elements[Cnt].readOnly) {
document.forms[0].elements[Cnt].focus();
break;
}
}
catch(e) {}
Cnt += 1;
}
}
}
return(true);
}
</script>
&nbsp;<p>To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.</p><form method="POST" action="https://ecf.caeb.uscourts.gov/doc1/032131165612" onsubmit="goDLS(&#39;/doc1/032131165612&#39;,&#39;640228&#39;,&#39;&#39;,&#39;1&#39;,&#39;&#39;,&#39;&#39;,&#39;&#39;,&#39;&#39;,&#39;&#39;,&#39;&#39;);return(false)"><hr><center><table border="1" bgcolor="white" width="400"><tbody><tr><th colspan="4"><font size="+1" color="DARKRED">PACER Service Center </font></th></tr><tr><th colspan="4"><font color="DARKBLUE">Transaction Receipt </font></th></tr><tr></tr><tr></tr><tr><td colspan="4" align="CENTER"><font size="-1" color="DARKBLUE">Thu Mar 2 14:10:07 2023</font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Pacer Login: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> jesus13law </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Client Code: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> </font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Description: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> Image:64-0 </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Case Number: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 20-10691 </font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Billable Pages: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 5 </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Cost: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 0.50 </font></td></tr><tr></tr><tr></tr></tbody></table></center><input type="submit" value="View Document"><div class="recap-banner"><a title="Document is available for free in the RECAP Archive." 
href="https://storage.courtlistener.com/recap/gov.uscourts.caeb.640228/gov.uscourts.caeb.640228.64.0.pdf"><img src="chrome-extension://oiillickanjlaeghobeeknbddaonmjnc/assets/images/icon-16.png"> Get this document for free from the RECAP Archive.</a></div></form></div><script>document.createElement("form").__proto__.submit = function () { this.id = "form" + new Date().getTime(); window.postMessage({id: this.id}, "*");};</script></body></html>
8 changes: 8 additions & 0 deletions tests/examples/pacer/confirmation_pages/caeb_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"billable_pages": "5",
"cost": "0.50",
"docket_number": "20-10691",
"document_description": "Image:64-0",
"document_number": null,
"transaction_date": "2023-03-02T14:10:07"
}
48 changes: 48 additions & 0 deletions tests/examples/pacer/confirmation_pages/cand_1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

<!-- saved from url=(0047)https://ecf.cand.uscourts.gov/doc1/035122812318 -->
<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><link rel="shortcut icon" href="https://ecf.cand.uscourts.gov/favicon.ico"><title>CAND-ECF</title>
<script type="text/javascript">var default_base_path = "/"; </script><script type="text/javascript">if (top!=self) {top.location.replace(location.href);}</script><link rel="stylesheet" type="text/css" href="./CAND-ECF_files/default.css"><script type="text/javascript" src="./CAND-ECF_files/core.js"></script><link rel="stylesheet" type="text/css" href="./CAND-ECF_files/print.css" media="print"><script type="text/javascript" src="./CAND-ECF_files/menu.pl.download"></script></head><body bgcolor="CCFFFF" text="000000" onload="SetFocus()"><iframe id="_yuiResizeMonitor" style="position: absolute; visibility: visible; width: 10em; height: 10em; top: -160px; left: -160px; border-width: 0px;" src="./CAND-ECF_files/saved_resource.html"></iframe> <div id="topmenu" class="yuimenubar yui-module yui-overlay visible" style="position: static; display: block; z-index: 30; visibility: visible;">
<div class="bd"><img src="./CAND-ECF_files/logo-cmecf-sm.png" class="cmecfLogo" id="cmecfLogo" alt="CM/ECF" title="">
<ul class="first-of-type">
<li class="yuimenubaritem first-of-type" id="yui-gen0" groupindex="0" index="0"><a class="yuimenubaritemlabel" href="https://ecf.cand.uscourts.gov/cgi-bin/iquery.pl"><u>Q</u>uery</a></li>
<li class="yuimenubaritem yuimenubaritem-hassubmenu" id="yui-gen1" groupindex="0" index="1"><a class="yuimenubaritemlabel yuimenubaritemlabel-hassubmenu" href="https://ecf.cand.uscourts.gov/cgi-bin/DisplayMenu.pl?Reports">Reports <div class="spritedownarrow"></div></a></li>
<li class="yuimenubaritem yuimenubaritem-hassubmenu" id="yui-gen2" groupindex="0" index="2"><a class="yuimenubaritemlabel yuimenubaritemlabel-hassubmenu" href="https://ecf.cand.uscourts.gov/cgi-bin/DisplayMenu.pl?Utilities"><u>U</u>tilities <div class="spritedownarrow"></div></a></li>
<li class="yuimenubaritem" id="yui-gen3" groupindex="0" index="3">
<a class="yuimenubaritemlabel" onclick="CMECF.MainMenu.showHelpPage(); return false">Help</a></li>

<li class="yuimenubaritem" id="yui-gen4" groupindex="0" index="4"><a class="yuimenubaritemlabel" href="https://ecf.cand.uscourts.gov/cgi-bin/login.pl?logout">Log Out</a></li></ul><hr class="hrmenuseparator"></div></div><script type="text/javascript">if (navigator.appVersion.indexOf("MSIE")==-1){window.setTimeout(CMECF.MainMenu.createMenu, 0);}else{CMECF.util.Event.addListener(window, "load", CMECF.MainMenu.createMenu);}</script> <div id="cmecfMainContent" style="height: 680px;"><input type="hidden" id="cmecfMainContentScroll" value="0"><script language="JavaScript">
var IsForm = false;
var FirstField;
function SetFocus() {
if(IsForm) {
if(FirstField) {
var ind = FirstField.indexOf('document.',0);
if(ind == 0)
{
eval(FirstField);
}
else
{
var Code = "document.forms[0]."+FirstField+".focus();";
eval(Code);
}
} else {
var Cnt = 0;
while(document.forms[0].elements[Cnt] != null) {
try {
if(document.forms[0].elements[Cnt].type != "hidden" &&
!document.forms[0].elements[Cnt].disabled &&
!document.forms[0].elements[Cnt].readOnly) {
document.forms[0].elements[Cnt].focus();
break;
}
}
catch(e) {}
Cnt += 1;
}
}
}
return(true);
}
</script>
<p>To accept charges shown below, click on the 'View Document' button, otherwise click the 'Back' button on your browser.</p><form method="post" action="https://ecf.cand.uscourts.gov/doc1/035122812318" onsubmit="goDLS(&#39;/doc1/035122812318&#39;,&#39;330489&#39;,&#39;2369&#39;,&#39;1&#39;,&#39;&#39;,&#39;1&#39;,&#39;&#39;,&#39;&#39;,&#39;&#39;);return(false);"><hr><center><table border="1" bgcolor="white" width="400"><tbody><tr><th colspan="4"><font size="+1" color="DARKRED">PACER Service Center </font></th></tr><tr><th colspan="4"><font color="DARKBLUE">Transaction Receipt </font></th></tr><tr></tr><tr></tr><tr><td colspan="4" align="CENTER"><font size="-1" color="DARKBLUE">Thu Mar 2 13:53:33 2023</font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Pacer Login: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> jesus13law </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Client Code: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> </font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Description: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> Image670-0 </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Case Number: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 3:18-cv-04865-EMC</font></td></tr><tr><th align="LEFT"><font size="-1" color="DARKBLUE"> Billable Pages: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 1 </font></td><th align="LEFT"><font size="-1" color="DARKBLUE"> Cost: </font></th><td align="LEFT"><font size="-1" color="DARKBLUE"> 0.10 </font></td></tr><tr></tr><tr></tr></tbody></table></center><input type="submit" value="View Document"></form></div><script>document.createElement("form").__proto__.submit = function () { this.id = "form" + new Date().getTime(); window.postMessage({id: this.id}, "*");};</script></body></html>
8 changes: 8 additions & 0 deletions tests/examples/pacer/confirmation_pages/cand_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"billable_pages": "1",
"cost": "0.10",
"docket_number": "3:18-cv-04865-EMC",
"document_description": "Image670-0",
"document_number": null,
"transaction_date": "2023-03-02T13:53:33"
}
Loading

0 comments on commit f33559f

Please sign in to comment.