From d1a47570065f2d71d6f2f27a88d29f1077f68b10 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 11 Nov 2024 20:35:36 -0500 Subject: [PATCH] fix(kan, kanctapp): scraper now apply filters Solves #1222 - scraper was not applying filters; wrong data must be cleaned on the DB level --- .../opinions/united_states/state/kan_p.py | 94 +++++++++++-------- .../opinions/united_states/state/kan_u.py | 7 +- .../united_states/state/kanctapp_p.py | 7 +- .../united_states/state/kanctapp_u.py | 7 +- .../united_states/kanctapp_p_example.html | 30 ++---- .../united_states/kanctapp_u_example.html | 30 ++---- 6 files changed, 79 insertions(+), 96 deletions(-) diff --git a/juriscraper/opinions/united_states/state/kan_p.py b/juriscraper/opinions/united_states/state/kan_p.py index ad093ba53..424276e39 100644 --- a/juriscraper/opinions/united_states/state/kan_p.py +++ b/juriscraper/opinions/united_states/state/kan_p.py @@ -1,42 +1,81 @@ # Scraper for Kansas Supreme Court (published) # CourtID: kan_p +from juriscraper.AbstractSite import logger from juriscraper.OpinionSiteLinear import OpinionSiteLinear class Site(OpinionSiteLinear): + court_filter = "Supreme Court" + status_filter = "Published" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.court_id = self.__module__ - self.url = "https://www.kscourts.org/Cases-Decisions/Decisions" + self.url = "https://www.kscourts.gov/Cases-Decisions/Decisions" self.request["verify"] = False - self.status = "Published" - self.court = "Supreme Court" + self.request["headers"].update( + { + "Referer": "https://kscourts.gov/Cases-Decisions/Decisions", + "Host": "kscourts.gov", + "Origin": "https://kscourts.gov", + } + ) + + def _process_html(self): + if not self.test_mode_enabled(): + # Loading from a fresh session causes an error page + self.html = self._download() + self._update_parameters() + self.html = self._download() + + for row in self.html.xpath(".//tr"): + date_filed, docket_number, case_name, court, status = row.xpath( + ".//td/a/text()" + ) + if court != self.court_filter or status != self.status_filter: + # Check for bug seen on #1222 + logger.error( + "Filters are not working, we got an opinion from %s", court + ) + continue + + url = row.xpath(".//td/a")[0].get("href") + self.cases.append( + { + "status": status, + "date": date_filed, + "docket": docket_number, + "name": case_name, + "url": url, + } + ) def _update_parameters(self): - """""" + """Apply filters through a form data POST request""" + self.method = "POST" view_state = self.html.xpath("//input[@id='__VIEWSTATE']")[0].get( "value" ) - VIEWSTATEGENERATOR = self.html.xpath( + view_state_generator = self.html.xpath( "//input[@id='__VIEWSTATEGENERATOR']" )[0].get("value") - CMSCsrfToken = self.html.xpath("//input[@id='__CMSCsrfToken']")[0].get( - "value" - ) - data = { - "__CMSCsrfToken": CMSCsrfToken, + cms_csrf_token = self.html.xpath("//input[@id='__CMSCsrfToken']")[ + 0 + ].get("value") + self.parameters = { + "__CMSCsrfToken": cms_csrf_token, "__EVENTTARGET": "", "__EVENTARGUMENT": "", "lng": "en-US", - "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR, + "__VIEWSTATEGENERATOR": view_state_generator, "__SCROLLPOSITIONX": "0", - "__SCROLLPOSITIONY": "239", + "__SCROLLPOSITIONY": "0", "p$lt$ctl01$SmartSearchBox3$txtWord_exWatermark_ClientState": "", "p$lt$ctl01$SmartSearchBox3$txtWord": "", "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$txtSearch": "", - "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$drpPublished": self.status, - "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$drpCourt": self.court, + "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$drpPublished": self.status_filter, + "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$drpCourt": self.court_filter, "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$drpSortBy": "Sort By", "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl02$OpinionFilter1$filterControl$btnFilter": "Apply Filters", "p$lt$zonePagePlaceholder$pageplaceholder$p$lt$ctl03$AccordionLayout1$acc_AccordionExtender_ClientState": "-1", @@ -45,30 +84,3 @@ def _update_parameters(self): "p$lt$ctl09$SmartSearchBox1$txtWord": "", "__VIEWSTATE": view_state, } - self.parameters = data - self.method = "POST" - self.request["verify"] = False - - def _process_html(self): - self.method = "POST" - if not self.test_mode_enabled(): - if ( - self.url - == "https://www.kscourts.org/Cases-Decisions/Decisions" - ): - self._update_parameters() - self.html = super()._download() - for row in self.html.xpath(".//tr"): - date_filed, docket_number, case_name, court, status = row.xpath( - ".//td/a/text()" - ) - url = row.xpath(".//td/a")[0].get("href") - self.cases.append( - { - "status": status, - "date": date_filed, - "docket": docket_number, - "name": case_name, - "url": url, - } - ) diff --git a/juriscraper/opinions/united_states/state/kan_u.py b/juriscraper/opinions/united_states/state/kan_u.py index 78225572b..150fc94a5 100644 --- a/juriscraper/opinions/united_states/state/kan_u.py +++ b/juriscraper/opinions/united_states/state/kan_u.py @@ -5,8 +5,5 @@ class Site(kan_p.Site): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.court_id = self.__module__ - self.status = "Unpublished" - self.court = "Supreme Court" + court_filter = "Supreme Court" + status_filter = "Unpublished" diff --git a/juriscraper/opinions/united_states/state/kanctapp_p.py b/juriscraper/opinions/united_states/state/kanctapp_p.py index ce8123b28..237f4e51b 100644 --- a/juriscraper/opinions/united_states/state/kanctapp_p.py +++ b/juriscraper/opinions/united_states/state/kanctapp_p.py @@ -5,8 +5,5 @@ class Site(kan_p.Site): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.court_id = self.__module__ - self.status = "Published" - self.court = "Court of Appeals" + court_filter = "Court of Appeals" + status_filter = "Published" diff --git a/juriscraper/opinions/united_states/state/kanctapp_u.py b/juriscraper/opinions/united_states/state/kanctapp_u.py index c0b577fd7..1155ee16f 100644 --- a/juriscraper/opinions/united_states/state/kanctapp_u.py +++ b/juriscraper/opinions/united_states/state/kanctapp_u.py @@ -5,8 +5,5 @@ class Site(kan_p.Site): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.court_id = self.__module__ - self.status = "Unpublished" - self.court = "Court of Appeals" + court_filter = "Court of Appeals" + status_filter = "Unpublished" diff --git a/tests/examples/opinions/united_states/kanctapp_p_example.html b/tests/examples/opinions/united_states/kanctapp_p_example.html index ec2a525d8..0f0241b22 100644 --- a/tests/examples/opinions/united_states/kanctapp_p_example.html +++ b/tests/examples/opinions/united_states/kanctapp_p_example.html @@ -1660,8 +1660,7 @@

Filed November 3, 2023

v. Phipps - Court - of Appeals + Court of Appeals Published @@ -1690,8 +1689,7 @@

Filed November 3, 2023

v. McCaslin - Court - of Appeals + Court of Appeals Published @@ -1720,8 +1718,7 @@

Filed November 3, 2023

v. Detimore - Court - of Appeals + Court of Appeals Published @@ -1751,8 +1748,7 @@

Filed November 3, 2023

v. Higdon - Court - of Appeals + Court of Appeals Published @@ -1781,8 +1777,7 @@

Filed November 3, 2023

v. Clingerman - Court - of Appeals + Court of Appeals Published @@ -1811,8 +1806,7 @@

Filed November 3, 2023

v. Nelson - Court - of Appeals + Court of Appeals Published @@ -1841,8 +1835,7 @@

Filed November 3, 2023

v. Koerner - Court - of Appeals + Court of Appeals Published @@ -1871,8 +1864,7 @@

Filed November 3, 2023

Wichita, Inc. v. Filardo - Court - of Appeals + Court of Appeals Published @@ -1902,8 +1894,7 @@

Filed November 3, 2023

of Salina, Kansas - Court - of Appeals + Court of Appeals Published @@ -1932,8 +1923,7 @@

Filed November 3, 2023

v. Spilman - Court - of Appeals + Court of Appeals Published diff --git a/tests/examples/opinions/united_states/kanctapp_u_example.html b/tests/examples/opinions/united_states/kanctapp_u_example.html index 5cfe7afd3..50afbc5c2 100644 --- a/tests/examples/opinions/united_states/kanctapp_u_example.html +++ b/tests/examples/opinions/united_states/kanctapp_u_example.html @@ -1659,8 +1659,7 @@

Filed November 3, 2023

v. Elnicki - Court - of Appeals + Court of Appeals Unpublished @@ -1689,8 +1688,7 @@

Filed November 3, 2023

v. State - Court - of Appeals + Court of Appeals Unpublished @@ -1719,8 +1717,7 @@

Filed November 3, 2023

v. Ward - Court - of Appeals + Court of Appeals Unpublished @@ -1749,8 +1746,7 @@

Filed November 3, 2023

re J.S. and K.G. - Court - of Appeals + Court of Appeals Unpublished @@ -1779,8 +1775,7 @@

Filed November 3, 2023

v. Eismann - Court - of Appeals + Court of Appeals Unpublished @@ -1809,8 +1804,7 @@

Filed November 3, 2023

re Adoption of V.A. - Court - of Appeals + Court of Appeals Unpublished @@ -1839,8 +1833,7 @@

Filed November 3, 2023

re S.T. - Court - of Appeals + Court of Appeals Unpublished @@ -1869,8 +1862,7 @@

Filed November 3, 2023

v. State - Court - of Appeals + Court of Appeals Unpublished @@ -1899,8 +1891,7 @@

Filed November 3, 2023

v. Bradley - Court - of Appeals + Court of Appeals Unpublished @@ -1929,8 +1920,7 @@

Filed November 3, 2023

re Marriage of S.D. and L.D. - Court - of Appeals + Court of Appeals Unpublished