Skip to content

Commit

Permalink
feat(pacer.email): improve bankruptcy short description parsing
Browse files Browse the repository at this point in the history
Solves #912, Solves #914

- simplify parsing by getting rid of cases by court groups
- support multi docket NEF parsing: add examples for deb, ctb, mdb, ndb, nhb, paeb, txnb
- support flsb
- correct wrong parsing for vaeb and mdb after double checking on PACER
- updated the paeb_1 example file, where the creation of the example file had broken parsing
  • Loading branch information
grossir committed Dec 17, 2024
1 parent 650e86a commit 1369764
Show file tree
Hide file tree
Showing 34 changed files with 2,063 additions and 228 deletions.
137 changes: 33 additions & 104 deletions juriscraper/pacer/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,116 +534,45 @@ def _get_docket_entries(
return []

def _parse_bankruptcy_short_description(self, subject: str) -> str:
"""Parse the short description of a bankruptcy case from the email
subject. Subjects for bankruptcy vary a lot from court to court.
This function supports parsing the short description for courts with
known examples.
"""Parse the short description of a bankruptcy case from the email subject
The subject contains: the docket number, the case name, the short description
and special characters. The presence and order of these elements vary from
court to court. The short description that we parse out should match the
"Description" on the "History/Document" report on PACER
Multi-docket NEFs are a special case, of which we have only seen
2-docket NEFs. These are labelled as "_multi_" on the example files, and we
have only seen "Adversary Cases", so far. The subject will use one
of the case names and one of the docket numbers, and then follow
the general rules
:param subject: The email subject string.
:return: The parsed short description.
"""

# See paeb_3.txt for a test of multi docket NEF
# So far, we have only seen 2-docket NEF
is_multidocket = len(self.docket_numbers) == 2
if len(self.docket_numbers) > 1 and self.court_id != "njb":
logger.error(
"Not parsing description for Bankruptcy Multi Docket NEF for court '%s'",
self.court_id,
extra={
"fingerprint": [
f"{self.court_id}-not-parsing-multi-docket-short-description"
]
},
)
return ""
if "Close Adversary Case" in subject:
return "Close Adversary Case"

short_description = ""
docket_number = self.docket_numbers[0]
case_name = self.case_names[0]

if is_multidocket:
# Docket number / case name from one or both of the 2 cases
# may be used in the subject string
if self.docket_numbers[1] in subject:
docket_number = self.docket_numbers[1]
if self.case_names[1] in subject:
case_name = self.case_names[1]

if self.court_id in [
"cacb",
"ctb",
"cob",
"ianb",
"nyeb",
"txnb",
"okeb",
]:
# In: 6:22-bk-13643-SY Request for courtesy Notice of Electronic Filing (NEF)
# Out: Request for courtesy Notice of Electronic Filing (NEF)
short_description = subject.split(docket_number)[-1]

# Remove docket number traces "-AAA"
regex = r"^-.*?\s"
short_description = re.sub(regex, "", short_description)
elif self.court_id in ["njb", "dcb", "vaeb", "paeb", "mdb", "arwb"]:
# In: Ch-11 19-27439-MBK Determination of Adjournment Request - Hollister Construc
# Out: Determination of Adjournment Request
# In Ch-13 1:24-bk-70534 Meeting of Creditors - Second Non-Appearance; Michael Clayton Lowry
# Out: Meeting of Creditors - Second Non-Appearance
short_description = subject.split(docket_number)[-1]
# Remove docket number traces "-AAA"
# Remove CH after docket and BK after short description for dcb
regex = r"^-.*?\s|C[Hh][\s\d]+|[ (]?B[Kk]( Other)?[) ]?"
short_description = re.sub(regex, "", short_description)
separator = ";" if self.court_id == "arwb" else "-"
short_description = short_description.rsplit(separator, 1)[0]
elif self.court_id == "nysb":
# In: 22-22507-cgm Ch13 Affidavit Re: Gerasimos Stefanitsis
# Out: Affidavit
short_description = subject.split(case_name)[0]
short_description = short_description.replace("Re:", "")
short_description = short_description.split(docket_number)[-1]

# Remove strings starting with "Ch" followed by a number
regex = r"\bCh\d+\b"
short_description = re.sub(regex, "", short_description)

# Remove docket number traces "-AAA"
regex = r"^-.*?\s"
short_description = re.sub(regex, "", short_description)
elif self.court_id in ["pawb", "ndb", "deb", "pamb", "nhb"]:
# In: Ch-7 22-20823-GLT U LOCK INC Reply
# Out: Reply
if case_name in subject:
short_description = subject.split(case_name)[-1]
elif case_name[:18] in subject:
# See deb_2.txt, pamb_1 and pamb_3 for examples
short_description = subject.split(case_name[:18])[-1]
elif (
" and " in case_name and case_name.split(" and ")[0] in subject
):
# See pamb_2.txt
short_description = subject.split(case_name.split(" and ")[0])[
-1
]
elif self.court_id in [
"tnmb",
]:
# In: Docket Order - Continue Hearing (Auto) Ch 13 Jeffery Wayne Lovell and Tiffany Nicole Lovell 1:24-bk-01377
# Out: Docket Order - Continue Hearing (Auto) Ch 13
if case_name in subject:
short_description = subject.split(case_name)[0]
else:
logger.error(
"Short description has no parsing for bankruptcy court '%s'",
self.court_id,
extra={
"fingerprint": [
f"{self.court_id}-not-parsing-short-description"
]
},
)
for part in self.docket_numbers + self.case_names:
subject = subject.replace(part, " ")

# Sometimes the full case name is not used in the `subject`
# Some courts use an 18 character limit
# See deb_2.txt, pamb_1 and pamb_3 for examples
for case_name in self.case_names:
subject = subject.replace(case_name[:18].strip(), " ")
subject = subject.replace(case_name.split(" and ")[0], " ")

# Deletes:
# - extra docket number 'components'
# - Chapter component
# - "NEF: " placeholder
regex_cleanup = r"(\-[A-Z]{2,})|(\-[a-z]{2,})|(C[Hh](apter)?[- ]?(13|7|9|11))|(NEF:? )"
subject = re.sub(regex_cleanup, " ", subject)
subject = subject.strip(" -;:, ")
# some courts use "Re: {case name}"
short_description = re.sub("( Re$)|(^Re:? )", "", subject)

return short_description

Expand Down
54 changes: 54 additions & 0 deletions tests/examples/pacer/nef/s3/ctb_multi_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"appellate": false,
"contains_attachments": true,
"court_id": "ctb",
"dockets": [
{
"case_name": "Lotto v. U.S. BANK TRUST NATIONAL ASSOCIATION NOT IN ITS I",
"date_filed": null,
"docket_entries": [
{
"date_filed": "2024-11-21",
"description": "Adversary case 24-03015. Complaint (21 (Validity, priority or extent of lien or other interest in property)) (81 (Subordination of claim or interest)) (91 (Declaratory judgment)) (02 (Other (e.g. other actions that would have been brought in state court if unrelated to bankruptcy))) (72 (Injunctive relief - other)) filed by Michael J.FirstName LastName on behalf of Michael Henry Lotto against U.S. BANK TRUST NATIONAL ASSOCIATION NOT IN ITS INDIVIDUAL CAPACITY BUT SOLELY AS OWNER TRUSTEE FOR RCF 2 ACQUISITION TRUST, Selene Finance LP, Discover Bank. Receipt NotDue Fee Amount $350. Fee Not Due. (Attachments: (1) Form B1040 - Adversary Proceeding Cover Sheet (2) Exhibit Exhibits A through F) (Habib, Michael)",
"document_number": "1",
"document_url": "https://ecf.ctb.uscourts.gov/doc1/040015374935?pdf_header=&magic_num=60081811&de_seq_num=3&caseid=315593",
"pacer_case_id": "315593",
"pacer_doc_id": "040015374935",
"pacer_magic_num": "60081811",
"pacer_seq_no": "3",
"short_description": "Complaint"
}
],
"docket_number": "24-03015",
"federal_defendant_number": null,
"federal_dn_case_type": null,
"federal_dn_judge_initials_assigned": null,
"federal_dn_judge_initials_referred": null,
"federal_dn_office_code": null
},
{
"case_name": "Michael Henry Lotto",
"date_filed": null,
"docket_entries": [
{
"date_filed": "2024-11-21",
"description": "Adversary case 24-03015. Complaint (21 (Validity, priority or extent of lien or other interest in property)) (81 (Subordination of claim or interest)) (91 (Declaratory judgment)) (02 (Other (e.g. other actions that would have been brought in state court if unrelated to bankruptcy))) (72 (Injunctive relief - other)) filed by Michael J.FirstName LastName on behalf of Michael Henry Lotto against U.S. BANK TRUST NATIONAL ASSOCIATION NOT IN ITS INDIVIDUAL CAPACITY BUT SOLELY AS OWNER TRUSTEE FOR RCF 2 ACQUISITION TRUST, Selene Finance LP, Discover Bank. Receipt NotDue Fee Amount $350. Fee Not Due. (Attachments: (1) Form B1040 - Adversary Proceeding Cover Sheet (2) Exhibit Exhibits A through F) (Habib, Michael)",
"document_number": "69",
"document_url": "https://ecf.ctb.uscourts.gov/doc1/040015374938?pdf_header=&magic_num=34452225&de_seq_num=185&caseid=314759",
"pacer_case_id": "314759",
"pacer_doc_id": "040015374938",
"pacer_magic_num": "34452225",
"pacer_seq_no": "185",
"short_description": "Complaint"
}
],
"docket_number": "24-30774",
"federal_defendant_number": null,
"federal_dn_case_type": null,
"federal_dn_judge_initials_assigned": null,
"federal_dn_judge_initials_referred": null,
"federal_dn_office_code": null
}
],
"email_recipients": []
}
148 changes: 148 additions & 0 deletions tests/examples/pacer/nef/s3/ctb_multi_1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
Return-Path: <[email protected]>
Received: from icmecf101.gtwy.uscourts.gov (icmecf101.gtwy.uscourts.gov [199.107.16.200])
by inbound-smtp.us-west-2.amazonaws.com with SMTP id k38b0456po5ke6jfm75uh4a3tc5aaur56gb8beo1
for [email protected];
Fri, 22 Nov 2024 03:05:50 +0000 (UTC)
X-SES-Spam-Verdict: PASS
X-SES-Virus-Verdict: PASS
Received-SPF: pass (spfCheck: domain of ctb.uscourts.gov designates 199.107.16.200 as permitted sender) client-ip=199.107.16.200; [email protected]; helo=icmecf101.gtwy.uscourts.gov;
Authentication-Results: amazonses.com;
spf=pass (spfCheck: domain of ctb.uscourts.gov designates 199.107.16.200 as permitted sender) client-ip=199.107.16.200; [email protected]; helo=icmecf101.gtwy.uscourts.gov;
dkim=pass [email protected];
dmarc=pass header.from=ctb.uscourts.gov;
X-SES-RECEIPT: AEFBQUFBQUFBQUFHbzBHb2lFUUZhNEtjTzlTZEpWWitjZFNzWkFqNWNndi82T1p0Y3dvdHhpb3pwNDhramc3Rkk2RVNsQVBMc1UrYWszOHIrVjh5d3dYelplUUVBclRvM1FxeTVtWkdlamViZTBUNXJveDZXN3dBNkkzVWZqdkVJQjZGaDJKOUd0REx5S3ZWMXFCSVMrRG04VkVuZ3lBQzlmRGVqMU5UalR0amNFQTVNZk42QkdDM2JpZit2VlBvZDl2eDcxLzF5Z1FBUUp0bHp6T08wS0Q4VzQxVGhId0pZWHVSQkt1ZjJac0E3WWp3V08xUlFuZnBxdURHZUdQcitScUMxZnY0VlArR0tkaFQ4V0RKYllrajc5ZE1SNUVXd05UZG91K2NtSFB2S0REN2h6YnF1YVFPWDBnYnBUbzJPUlBRdFdKV0tVVGs9
X-SES-DKIM-SIGNATURE: a=rsa-sha256; q=dns/txt; b=HUFtJui1IUY87ie8tiJHTRRK+FFiLU41Ex4Wc36g/gMTxY8XocRYvdrAI/2iCFvbLlfB6jAxh72bGcKYuRHUHwvTV26g1bwp6IRcY+VdD53vpwo0MjSALnCDE/Tybiye6UqZh/rII3V6nJSHs17TfhDAe9PaYMQzkc8NjxQwGeI=; c=relaxed/simple; s=7v7vs6w47njt4pimodk5mmttbegzsi6n; d=amazonses.com; t=1732244751; v=1; bh=lVDjHVq6Iir6oq29ypBqSxqrAuDIhBJrNUzwKV8MRxM=; h=From:To:Cc:Bcc:Subject:Date:Message-ID:MIME-Version:Content-Type:X-SES-RECEIPT;
DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple;
d=uscourts.gov; [email protected]; l=8269; q=dns/txt;
s=law1e; t=1732244750; x=1763780750;
h=date:x-authentication-warning:mime-version:from:to:
message-id:subject:content-type:x-sbrs:x-remote-ip;
bh=lVDjHVq6Iir6oq29ypBqSxqrAuDIhBJrNUzwKV8MRxM=;
b=LP9HL2sv/m9Q22TTqUNfl1aspzjgOrrceR2DuznMvtXLJ7e+SD4gm0Uk
oZXTQliEvqXP6DxIOVLbPlrl185s1wuZfNp31dnw+b0/gxghr7o1DG5ZM
I1rQpwGczNWHkGuns8JGz6WyY8tAr64VWKMJPKGxV2KU3bi0S4z14l/h7
Fm/XVfESk5eOKRcDaFGPKs4Efq8QyaEXG1eMDyR6jmC5ZPBZwsFDRiCIG
sugF11h7Hkk2wIhVrW3e7itaEWN2uW4OBosDnX7Hq873Rc8sa3HjMzbqE
LIWyi3u7s4L3Vn5iT2QhE+JguKFAQsiXmZNz/oIcLGjoYrLKpuRVEk/qt
A==;
X-SBRS: None
X-REMOTE-IP: 156.119.191.114
Received: from ctbdb.ctb.gtwy.dcn ([156.119.191.114])
by icmecf101.gtwy.uscourts.gov with ESMTP; 21 Nov 2024 22:05:50 -0500
Received: from ctbdb.ctb.gtwy.dcn (localhost.localdomain [127.0.0.1])
by ctbdb.ctb.gtwy.dcn (8.14.7/8.14.7) with ESMTP id 4AM355VE102900;
Thu, 21 Nov 2024 22:05:06 -0500
Received: (from ecf_web@localhost)
by ctbdb.ctb.gtwy.dcn (8.14.7/8.14.4/Submit) id 4AM34g9q101211;
Thu, 21 Nov 2024 22:04:42 -0500
Date: Thu, 21 Nov 2024 22:04:42 -0500
X-Authentication-Warning: ctbdb.ctb.gtwy.dcn: ecf_web set sender to [email protected] using -f
MIME-Version:1.0
From:[email protected]
To:[email protected]
Message-Id:<[email protected]>
Subject:24-03015 Complaint
Content-Type: text/html

<p><strong>***NOTE TO PUBLIC ACCESS USERS*** Judicial Conference of the United States policy permits attorneys of record and parties in a case (including pro se litigants) to receive one free electronic copy of all documents filed electronically, if receipt is required by law or directed by the filer. PACER access fees apply to all other users. To avoid later charges, download a copy of each document during this first viewing. However, if the referenced document is a transcript, the free copy and 30-page limit do not apply.</strong></p>




<p align=center><strong>U.S. Bankruptcy Court</strong></p>

<p align=center><strong>District of Connecticut</strong></p>
Notice of Electronic Filing
<BR>
<div>
<BR>The following transaction was received from LastName, FirstName entered on 11/21/2024 at 10:04 PM EST and filed on 11/21/2024

<BR>



<table border=0 cellspacing=0>
<tr><td><strong>Case Name:</strong>
</td><td>Lotto v. U.S. BANK TRUST NATIONAL ASSOCIATION NOT IN ITS I</td></tr>
<tr><td><strong>Case Number:</strong></td><td><A HREF=https://ecf.ctb.uscourts.gov/cgi-bin/DktRpt.pl?315593>24-03015</A></td></tr>

<tr><td><strong>Document Number:</strong></td>
<td>
<a href='https://ecf.ctb.uscourts.gov/doc1/040015374935?pdf_header=&magic_num=60081811&de_seq_num=3&caseid=315593'>1</a>
</td></tr>
</table>



<table border=0 cellspacing=0>
<tr><td><strong>Case Name:</strong>
</td><td>Michael Henry Lotto </td></tr>
<tr><td><strong>Case Number:</strong></td><td><A HREF=https://ecf.ctb.uscourts.gov/cgi-bin/DktRpt.pl?314759>24-30774</A></td></tr>

<tr><td><strong>Document Number:</strong></td>
<td>
<a href='https://ecf.ctb.uscourts.gov/doc1/040015374938?pdf_header=&magic_num=34452225&de_seq_num=185&caseid=314759'>69</a>
</td></tr>
</table>




<p><strong>Docket Text:</strong>

<BR>
Adversary case 24-03015. Complaint <i></i> (21 (Validity, priority or extent of lien or other interest in property)) (81 (Subordination of claim or interest)) (91 (Declaratory judgment)) (02 (Other (e.g. other actions that would have been brought in state court if unrelated to bankruptcy))) (72 (Injunctive relief - other)) filed by Michael J.FirstName LastName on behalf of Michael Henry Lotto against U.S. BANK TRUST NATIONAL ASSOCIATION NOT IN ITS INDIVIDUAL CAPACITY BUT SOLELY AS OWNER TRUSTEE FOR RCF 2 ACQUISITION TRUST, Selene Finance LP, Discover Bank. Receipt #NotDue Fee Amount $350. Fee Not Due. (Attachments: # (1) Form B1040 - Adversary Proceeding Cover Sheet # (2) Exhibit Exhibits A through F) (Habib, Michael)
</p>

<p>The following document(s) are associated with this transaction:</p>
<table>
<STRONG>Document description:</STRONG>Main Document
<BR><STRONG>Original filename:</STRONG>ADVERSARY COMPLAINT_LOTTO_11.21.2024_FINAL.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733343
<BR><TAB>-0] [b62f864b285b5925b6a685449c54a1290aef8c5e400bb26440e2a0575a4a09235
<BR><TAB>2ead8528545efe9e53c2c322ed9ae12e5aa6dae2105c2c6c4a9bf21e97720b1]]
<BR>
<STRONG>Document description:</STRONG> Form B1040 - Adversary Proceeding Cover Sheet
<BR><STRONG>Original filename:</STRONG>C:\fakepath\Form B1040_Adversary Proceeding Cover Sheet_Lotto_11.21.2024.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733343
<BR><TAB>-1] [b55c2b39a3993a6d04513dca153f4991202323423d3677b74a1f52bccfd5a365e
<BR><TAB>5ff1bd8c72d560a86d67171fe3c87a43f4f74aa6ec1b9d8183832136b32af6f]]
<BR>
<STRONG>Document description:</STRONG>Exhibit Exhibits A through F
<BR><STRONG>Original filename:</STRONG>C:\fakepath\Exhibits to Complaint_Lotto_11.21.2024.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733343
<BR><TAB>-2] [7dcce3f46d124cfc6321d47356cdebe04b98d8f0f0616953f99bdfa65935b73e8
<BR><TAB>75d34a1a674d5a287e3cb72d407cbe19af48e448f2d8a793a18a29837cb1f07]]
<BR>
<STRONG>Document description:</STRONG>Main Document
<BR><STRONG>Original filename:</STRONG>ADVERSARY COMPLAINT_LOTTO_11.21.2024_FINAL.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733344
<BR><TAB>-0] [18834182338b3a081bd17cadae69000cd91cda1538665246a3b41313f286209ac
<BR><TAB>9b5e95b87340081b6aea9c62008b08cc285760420546c138093c3677332aa43]]
<BR>
<STRONG>Document description:</STRONG> Form B1040 - Adversary Proceeding Cover Sheet
<BR><STRONG>Original filename:</STRONG>C:\fakepath\Form B1040_Adversary Proceeding Cover Sheet_Lotto_11.21.2024.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733344
<BR><TAB>-1] [75c34453ecbcc75d6b285a034223d5ea1bcd2ce26c4c7db30cf4e23500ee06694
<BR><TAB>a5a245a1d55f7a4f9e4873ebba4e17a78a5adb58ea2760cfecfbea1fcb99299]]
<BR>
<STRONG>Document description:</STRONG>Exhibit Exhibits A through F
<BR><STRONG>Original filename:</STRONG>C:\fakepath\Exhibits to Complaint_Lotto_11.21.2024.pdf
<BR><STRONG>Electronic document Stamp:</STRONG>
<BR><TAB>[STAMP bkecfStamp_ID=1018027260 [Date=11/21/2024] [FileNumber=15733344
<BR><TAB>-2] [196a1a6e452b7c81f82369f7c47c0bc62e99f22de7f3cbf784da443f1ee7a3512
<BR><TAB>96e043738965900435410ce007ae9c084b121feb5d585bcc2dad8bbf8d2bc5b]]
<BR>

</table>
</div>




<BR><B>
2 changes: 1 addition & 1 deletion tests/examples/pacer/nef/s3/dcb_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"pacer_doc_id": null,
"pacer_magic_num": null,
"pacer_seq_no": null,
"short_description": "Hearing Held"
"short_description": "Hearing Held (BK)"
}
],
"docket_number": "23-00285",
Expand Down
Loading

0 comments on commit 1369764

Please sign in to comment.