From ce26428cab47ea25ab35570154c0a445c79c239d Mon Sep 17 00:00:00 2001 From: ttys0dev <126845556+ttys0dev@users.noreply.github.com> Date: Thu, 5 Oct 2023 23:15:55 -0600 Subject: [PATCH] Migrate http client from requests to httpx async client --- juriscraper/AbstractSite.py | 26 ++-- juriscraper/DeferringList.py | 11 +- juriscraper/OpinionSiteLinearWebDriven.py | 4 +- juriscraper/OpinionSiteWebDriven.py | 4 +- .../OralArgumentSiteLinearWebDriven.py | 4 +- juriscraper/fdsys/FDSysSite.py | 41 +++--- juriscraper/fdsys/scrape_court_names.py | 28 ++-- juriscraper/lasc/http.py | 49 ++++--- juriscraper/lib/cookie_utils.py | 16 +-- juriscraper/lib/html_utils.py | 2 +- juriscraper/lib/importer.py | 2 +- juriscraper/lib/network_utils.py | 20 --- juriscraper/lib/test_utils.py | 26 ++-- .../federal_appellate/ca3.py | 9 +- .../federal_appellate/cadc.py | 8 -- .../united_states_backscrapers/state/nd.py | 12 +- .../pacer/appellate_attachment_page.py | 4 +- juriscraper/pacer/appellate_docket.py | 8 +- juriscraper/pacer/attachment_page.py | 4 +- juriscraper/pacer/case_query.py | 6 +- juriscraper/pacer/case_query_advanced.py | 6 +- juriscraper/pacer/claims_activity.py | 4 +- juriscraper/pacer/claims_register.py | 6 +- juriscraper/pacer/docket_history_report.py | 6 +- juriscraper/pacer/docket_report.py | 4 +- .../pacer/download_confirmation_page.py | 4 +- juriscraper/pacer/free_documents.py | 12 +- juriscraper/pacer/hidden_api.py | 12 +- juriscraper/pacer/http.py | 133 ++++++++++++------ juriscraper/pacer/internet_archive.py | 26 ++-- juriscraper/pacer/list_of_creditors.py | 23 +-- juriscraper/pacer/mobile_query.py | 4 +- juriscraper/pacer/reports.py | 50 ++++--- juriscraper/pacer/rss_feeds.py | 19 +-- juriscraper/pacer/utils.py | 12 +- requirements.txt | 2 +- tests/local/test_PacerNeedLoginTest.py | 26 ++-- tests/local/test_PacerSessionTest.py | 24 ++-- tests/network/test_PacerAuthTest.py | 10 +- tests/network/test_PacerCaseQueryTest.py | 10 +- tests/network/test_PacerDocketReportTest.py | 32 +++-- tests/network/test_PacerFreeOpinionsTest.py | 84 +++++------ tests/network/test_PacerMobileQueryTest.py | 10 +- tests/network/test_PacerSessionTest.py | 8 +- tests/network/test_PacerShowCaseDocApiTest.py | 6 +- 45 files changed, 452 insertions(+), 365 deletions(-) diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 53b458d7b..1264b8bfc 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -3,7 +3,7 @@ from datetime import date, datetime import certifi -import requests +import httpx from juriscraper.lib.date_utils import fix_future_year_typo, json_date_handler from juriscraper.lib.exceptions import InsanityException @@ -33,7 +33,7 @@ class AbstractSite: Should not contain lists that can't be sorted by the _date_sort function. """ - def __init__(self, cnt=None): + def __init__(self, cnt=None, user_agent="Juriscraper", **kwargs): super().__init__() # Computed metadata @@ -44,10 +44,12 @@ def __init__(self, cnt=None): self.downloader_executed = False self.cookies = {} self.cnt = cnt or CaseNameTweaker() + self.user_agent = user_agent + kwargs.setdefault("http2", True) self.request = { "verify": certifi.where(), - "session": requests.session(), - "headers": {"User-Agent": "Juriscraper"}, + "session": httpx.AsyncClient(**kwargs), + "headers": {"User-Agent": self.user_agent}, # Disable CDN caching on sites like SCOTUS (ahem) "cache-control": "no-cache, no-store, max-age=1", "parameters": {}, @@ -65,8 +67,8 @@ def __init__(self, cnt=None): self._req_attrs = [] self._all_attrs = [] - def __del__(self): - self.close_session() + async def __aexit__(self): + await self.close_session() def __str__(self): out = [] @@ -84,9 +86,9 @@ def __getitem__(self, i): def __len__(self): return len(self.case_names) - def close_session(self): + async def close_session(self): if self.request["session"]: - self.request["session"].close() + await self.request["session"].aclose() def _make_item(self, i): """Using i, convert a single item into a dict. This is effectively a @@ -344,12 +346,12 @@ def _process_request_parameters(self, parameters={}): del parameters["verify"] self.request["parameters"] = parameters - def _request_url_get(self, url): + async def _request_url_get(self, url): """Execute GET request and assign appropriate request dictionary values """ self.request["url"] = url - self.request["response"] = self.request["session"].get( + self.request["response"] = await self.request["session"].get( url, headers=self.request["headers"], verify=self.request["verify"], @@ -357,10 +359,10 @@ def _request_url_get(self, url): **self.request["parameters"], ) - def _request_url_post(self, url): + async def _request_url_post(self, url): """Execute POST request and assign appropriate request dictionary values""" self.request["url"] = url - self.request["response"] = self.request["session"].post( + self.request["response"] = await self.request["session"].post( url, headers=self.request["headers"], verify=self.request["verify"], diff --git a/juriscraper/DeferringList.py b/juriscraper/DeferringList.py index a05e0ece9..15f3c68c0 100644 --- a/juriscraper/DeferringList.py +++ b/juriscraper/DeferringList.py @@ -1,3 +1,6 @@ +import asyncio +import inspect + from juriscraper.AbstractSite import logger @@ -42,7 +45,13 @@ def __getitem__(self, item): logger.info( f"Getting deferred value from seed: {self._data[item]}" ) - new_val = self._fetching_function(self._data[item]) + if inspect.isawaitable(self._fetching_function): + loop = asyncio.get_event_loop() + new_val = loop.run_until_complete( + self._fetching_function(self._data[item]) + ) + else: + new_val = self._fetching_function(self._data[item]) self._data[item] = new_val self._fetched_items[item] = True return new_val diff --git a/juriscraper/OpinionSiteLinearWebDriven.py b/juriscraper/OpinionSiteLinearWebDriven.py index 1dffadd9e..93ccee667 100644 --- a/juriscraper/OpinionSiteLinearWebDriven.py +++ b/juriscraper/OpinionSiteLinearWebDriven.py @@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) WebDriven.__init__(self, args, kwargs) + async def __aexit__(self): + await self.close_session() + def __del__(self): - self.close_session() self.close_webdriver_session() diff --git a/juriscraper/OpinionSiteWebDriven.py b/juriscraper/OpinionSiteWebDriven.py index 77a2a1b13..15f892bcf 100644 --- a/juriscraper/OpinionSiteWebDriven.py +++ b/juriscraper/OpinionSiteWebDriven.py @@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) WebDriven.__init__(self, args, kwargs) + async def __aexit__(self): + await self.close_session() + def __del__(self): - self.close_session() self.close_webdriver_session() diff --git a/juriscraper/OralArgumentSiteLinearWebDriven.py b/juriscraper/OralArgumentSiteLinearWebDriven.py index 3177d5d39..a316739d0 100644 --- a/juriscraper/OralArgumentSiteLinearWebDriven.py +++ b/juriscraper/OralArgumentSiteLinearWebDriven.py @@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) WebDriven.__init__(self, args, kwargs) + async def __aexit__(self): + await self.close_session() + def __del__(self): - self.close_session() self.close_webdriver_session() diff --git a/juriscraper/fdsys/FDSysSite.py b/juriscraper/fdsys/FDSysSite.py index bd93f92f7..cdfc21c41 100644 --- a/juriscraper/fdsys/FDSysSite.py +++ b/juriscraper/fdsys/FDSysSite.py @@ -5,19 +5,20 @@ from datetime import date from pprint import pprint -import requests +import httpx +from httpx import InvalidURL from lxml import etree -from requests.exceptions import MissingSchema from juriscraper.AbstractSite import AbstractSite -def get_tree(url): +async def get_tree(url, **kwargs): try: - response = requests.get(url, stream=True) - response.raw.decode_content = True - return etree.parse(response.raw) - except MissingSchema: + kwargs.setdefault("http2", True) + async with httpx.AsyncClient(**kwargs) as client: + response = await client.get(url) + return etree.parse(await response.aread()) + except InvalidURL: return etree.parse(url) @@ -160,23 +161,25 @@ def __getitem__(self, i): def __len__(self): return len(xpath(self.html, "//s:loc/text()")) - def save_mods_file(self, url): + async def save_mods_file(self, url, **kwargs): mods_url = FDSysModsContent._get_mods_file_url(url) name = "-".join(mods_url.split("/")[-2].split("-")[1:]) - with open(f"./examples/2006/{name}.xml", "w") as handle: - response = requests.get(mods_url, stream=True) - for block in response.iter_content(1024): - handle.write(block) - - def _download(self, request_dict={}): + with open(f"./examples/2006/{name}.xml", "wb") as handle: + kwargs.setdefault("http2", True) + async with httpx.AsyncClient(**kwargs) as client: + async with client.stream("GET", mods_url) as response: + async for block in response.aiter_bytes(): + handle.write(block) + + async def _download(self, request_dict={}): """ it actually builds an XML tree """ - return get_tree(self.url) + return await get_tree(self.url) - def _download_backwards(self, year): + async def _download_backwards(self, year): self.url = self.base_url.format(year=year) - self.html = self._download() + self.html = await self._download() if self.html is not None: # Setting status is important because it prevents the download # function from being run a second time by the parse method. @@ -185,10 +188,10 @@ def _download_backwards(self, year): def _check_sanity(self): pass - def parse(self): + async def parse(self): if self.status is None: # Run the downloader if it hasn't been run already - self.html = self._download() + self.html = await self._download() return self diff --git a/juriscraper/fdsys/scrape_court_names.py b/juriscraper/fdsys/scrape_court_names.py index 63c08fa94..b1d8c90e4 100644 --- a/juriscraper/fdsys/scrape_court_names.py +++ b/juriscraper/fdsys/scrape_court_names.py @@ -1,12 +1,17 @@ +import asyncio import json from pprint import pprint -import requests +import httpx from lxml import etree, html -def get_court_names(): - response = requests.get("https://www.courtlistener.com/api/jurisdictions/") +async def get_court_names(**kwargs): + kwargs.setdefault("http2", True) + async with httpx.AsyncClient(**kwargs) as client: + response = await client.get( + "https://www.courtlistener.com/api/jurisdictions/" + ) tree = html.fromstring(response.text) data = dict() @@ -21,13 +26,14 @@ def get_court_names(): json.dump(data, f) -def get_fdsys_court_names(): - response = requests.get( - "https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml", - stream=True, - ) - response.raw.decode_content = True - tree = etree.parse(response.raw) +async def get_fdsys_court_names(**kwargs): + kwargs.setdefault("http2", True) + async with httpx.AsyncClient(**kwargs) as client: + response = await client.get( + "https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml" + ) + tree = etree.parse(await response.aread()) + data = dict() for url in tree.xpath( @@ -47,4 +53,4 @@ def get_fdsys_court_names(): if __name__ == "__main__": # get_court_names() - get_fdsys_court_names() + asyncio.run(get_fdsys_court_names()) diff --git a/juriscraper/lasc/http.py b/juriscraper/lasc/http.py index 97637a6bc..3135acd41 100644 --- a/juriscraper/lasc/http.py +++ b/juriscraper/lasc/http.py @@ -1,22 +1,20 @@ -import requests +import httpx from lxml.html import fromstring from ..lib.log_tools import make_default_logger -requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.InsecureRequestWarning -) - logger = make_default_logger() -class LASCSession(requests.Session): +class LASCSession(httpx.AsyncClient): """ - A requests.Session object with special tooling to handle the Los Angeles + A httpx.AsyncClient object with special tooling to handle the Los Angeles Superior Court Media Access portal. """ - def __init__(self, username=None, password=None): + def __init__( + self, username=None, password=None, user_agent="Juriscraper", **kwargs + ): """ Instantiate a new LASC HTTP Session with some Juriscraper defaults. This method requires credentials from the media access portal. @@ -25,7 +23,7 @@ def __init__(self, username=None, password=None): :param password: MAP password :return: A LASCSession object """ - super().__init__() + super().__init__(**kwargs) self.html = None @@ -53,34 +51,35 @@ def __init__(self, username=None, password=None): "password": password, "request_type": "RESPONSE", } + self.user_agent = user_agent self.headers = { "Origin": ms_base_url, - "User-Agent": "Juriscraper", + "User-Agent": self.user_agent, } - def get(self, url, auto_login=False, **kwargs): - """Overrides request.Session.get with session retry logic. + async def get(self, url, auto_login=False, **kwargs): + """Overrides httpx.AsyncClient.get with session retry logic. :param url: url string to GET :param auto_login: Whether the auto-login procedure should happen. - :return: requests.Response + :return: httpx.Response """ kwargs.setdefault("timeout", 30) kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"}) - return super().get(url, **kwargs) + return await super().get(url, **kwargs) - def post(self, url, auto_login=False, **kwargs): - """Overrides request.Session.post with session retry logic. + async def post(self, url, auto_login=False, **kwargs): + """Overrides httpx.AsyncClient.post with session retry logic. :param url: url string to GET :param auto_login: Whether the auto-login procedure should happen. - :return: requests.Response + :return: httpx.Response """ kwargs.setdefault("timeout", 30) kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"}) - return super().post(url, **kwargs) + return await super().post(url, **kwargs) @staticmethod def _parse_new_html_for_keys(r): @@ -89,7 +88,7 @@ def _parse_new_html_for_keys(r): This method parses the HTML after the first login page and identifies the parameter values required for the next step. - :param r: A request.Response object + :param r: A httpx.Response object :return: A dict containing the needed keys """ html = fromstring(r.text) @@ -103,7 +102,7 @@ def _parse_new_html_for_keys(r): def _check_login(r): """Check that the login succeeded - :param r: A request.Response object + :param r: A httpx.Response object :return: None :raises LASCLoginException """ @@ -121,7 +120,7 @@ def _check_login(r): def _update_header_token(self, r): self.headers["X-CSRF-TOKEN"] = r.text.split("csrf")[1].split('"')[2] - def login(self): + async def login(self): """Log into the LASC Media Access Portal The process is tricky, requiring two GET requests, each of which returns HTML or JSON that is parsed for values to send in a subsequent @@ -326,20 +325,20 @@ def login(self): """ logger.info("Logging into MAP has begun") - r = self.get(self.login_url) + r = await self.get(self.login_url) self._update_header_token(r) # Call part one of Microsoft login API - r = self.post(self.api_url1, data=self.login_data) + r = await self.post(self.api_url1, data=self.login_data) self._check_login(r) # Call part two of Microsoft login API - Redirect - r = self.get(self.api_url2) + r = await self.get(self.api_url2) # Finalize login with post into LA MAP site parsed_keys = self._parse_new_html_for_keys(r) - self.post(self.signin_url, data=parsed_keys) + await self.post(self.signin_url, data=parsed_keys) logger.info("Successfully Logged into MAP") diff --git a/juriscraper/lib/cookie_utils.py b/juriscraper/lib/cookie_utils.py index 639d00e38..73f85cb76 100644 --- a/juriscraper/lib/cookie_utils.py +++ b/juriscraper/lib/cookie_utils.py @@ -1,20 +1,20 @@ -from requests.cookies import RequestsCookieJar +from httpx import Cookies def normalize_cookies(cookies): - """Takes cookies from Selenium or from Python Requests and + """Takes cookies from Selenium or from Python HTTPX and converts them to dict. This throws away information that Selenium otherwise has (like the host and such), but a dict is essentially all we need. """ - requests_cookies = {} + httpx_cookies = {} if type(cookies) == list: # Selenium cookies for cookie in cookies: - requests_cookies[cookie["name"]] = cookie["value"] - elif type(cookies) == RequestsCookieJar: - # Requests cookies. Convert to dict. - requests_cookies = dict(cookies) + httpx_cookies[cookie["name"]] = cookie["value"] + elif type(cookies) == Cookies: + # HTTPX cookies. Convert to dict. + httpx_cookies = dict(cookies) - return requests_cookies + return httpx_cookies diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index fbb7c3b0f..30dffc07c 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -4,11 +4,11 @@ from urllib.parse import urlsplit, urlunsplit import lxml +from httpx import Response from lxml import etree, html from lxml.etree import XMLSyntaxError from lxml.html import HtmlElement, fromstring, html5parser, tostring from lxml.html.clean import Cleaner -from requests import Response try: # Use charset-normalizer for performance to detect the character encoding. diff --git a/juriscraper/lib/importer.py b/juriscraper/lib/importer.py index adafc8fb5..29dbd4a98 100644 --- a/juriscraper/lib/importer.py +++ b/juriscraper/lib/importer.py @@ -1,6 +1,6 @@ import os -from requests import HTTPError +from httpx import HTTPError def build_module_list(court_id): diff --git a/juriscraper/lib/network_utils.py b/juriscraper/lib/network_utils.py index 878ec428f..ea55dcadf 100644 --- a/juriscraper/lib/network_utils.py +++ b/juriscraper/lib/network_utils.py @@ -1,29 +1,9 @@ import random import time -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.poolmanager import PoolManager - from juriscraper.AbstractSite import logger -class SSLAdapter(HTTPAdapter): - """An HTTPS Transport Adapter that uses an arbitrary SSL version.""" - - def __init__(self, ssl_version=None, **kwargs): - self.ssl_version = ssl_version - - super().__init__(**kwargs) - - def init_poolmanager(self, connections, maxsize, block=False): - self.poolmanager = PoolManager( - num_pools=connections, - maxsize=maxsize, - block=block, - ssl_version=self.ssl_version, - ) - - def add_delay(delay=0, deviation=0): """Create a semi-random delay. diff --git a/juriscraper/lib/test_utils.py b/juriscraper/lib/test_utils.py index 5d654b894..f4cfa9cb9 100644 --- a/juriscraper/lib/test_utils.py +++ b/juriscraper/lib/test_utils.py @@ -2,11 +2,16 @@ import sys import warnings -from requests.exceptions import ConnectionError -from requests.models import Request, Response +from httpx import Request, RequestError, Response from .exceptions import SlownessException +try: + # Use charset-normalizer for performance to detect the character encoding. + import charset_normalizer as chardet +except ImportError: + import chardet + WARN_SLOW_SCRAPERS = "CI" in os.environ @@ -36,22 +41,25 @@ def __init__(self, url=None): self.request = self def get(self): - r = Response() try: with open(self.url, mode="rb") as stream: - r._content = stream.read() + content = stream.read() + r = Response( + status_code=200, + request=self.request, + default_encoding=chardet.detect(content)["encoding"], + ) + r._content = content #: Integer Code of responded HTTP Status. - r.status_code = 200 if self.url.endswith("json"): r.headers["content-type"] = "application/json" except OSError as e: - r.status_code = 404 - raise ConnectionError(e) + raise RequestError(e) - r._content_consumed = True + r.is_stream_consumed = True #: Final URL location of Response. - r.url = self.url + r.request.url = self.url # Return the response. return r diff --git a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca3.py b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca3.py index 06640af00..e4dc0232e 100644 --- a/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca3.py +++ b/juriscraper/opinions/united_states_backscrapers/federal_appellate/ca3.py @@ -2,7 +2,6 @@ from datetime import date import certifi -import requests from lxml import html from juriscraper.AbstractSite import logger @@ -38,14 +37,14 @@ def _download(self, request_dict={}): html_trees.extend(html_next_trees) return html_trees - def _get_case_html_page(self, html_trees, html_l, request_dict): + async def _get_case_html_page(self, html_trees, html_l, request_dict): """Gets each of the individual case pages""" - s = requests.session() + s = self.request["session"] for case_url in html_l.xpath(self.case_xpath): logger.info(f" Getting sub-page at: {case_url}") - r = s.get( + r = await s.get( case_url, - headers={"User-Agent": "Juriscraper"}, + headers={"User-Agent": self.user_agent}, verify=certifi.where(), timeout=60, **request_dict, diff --git a/juriscraper/opinions/united_states_backscrapers/federal_appellate/cadc.py b/juriscraper/opinions/united_states_backscrapers/federal_appellate/cadc.py index 4e539aace..ebe7bb7ba 100644 --- a/juriscraper/opinions/united_states_backscrapers/federal_appellate/cadc.py +++ b/juriscraper/opinions/united_states_backscrapers/federal_appellate/cadc.py @@ -22,14 +22,6 @@ def __init__(self, *args, **kwargs): ) ] - def _get_adapter_instance(self): - """Unfortunately this court doesn't support modern crypto, so you have - to manually downgrade the crypto it uses. - - See: http://stackoverflow.com/questions/14102416/ - """ - return SSLAdapter(ssl_version=ssl.PROTOCOL_TLSv1) - def _get_case_names(self): return [ e diff --git a/juriscraper/opinions/united_states_backscrapers/state/nd.py b/juriscraper/opinions/united_states_backscrapers/state/nd.py index c68fd2aac..ff73a3d74 100644 --- a/juriscraper/opinions/united_states_backscrapers/state/nd.py +++ b/juriscraper/opinions/united_states_backscrapers/state/nd.py @@ -4,8 +4,6 @@ import re from datetime import date, datetime -import requests - from juriscraper.DeferringList import DeferringList from juriscraper.opinions.united_states.state import nd @@ -19,19 +17,19 @@ def __init__(self, *args, **kwargs): today.strftime("%b%Y") ) - def _get_download_urls(self): + async def _get_download_urls(self): """We use a fetcher and a DeferringList object and a HEAD request to test whether the wpd exists for a case""" - def fetcher(html_link): + async def fetcher(html_link): if self.test_mode_enabled(): return html_link # Can't fetch remote during tests case_number = re.search(r"(\d+)", html_link).group(0) wpd_link = f"http://www.ndcourts.gov/wp/{case_number}.wpd" - r = requests.head( + r = await self.request["session"].head( wpd_link, - allow_redirects=False, - headers={"User-Agent": "Juriscraper"}, + follow_redirects=False, + headers={"User-Agent": self.user_agent}, ) if r.status_code == 200: return wpd_link diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py index bb25cbbf1..58d6e75fd 100644 --- a/juriscraper/pacer/appellate_attachment_page.py +++ b/juriscraper/pacer/appellate_attachment_page.py @@ -30,7 +30,7 @@ class AppellateAttachmentPage(BaseReport): def __init__(self, court_id, pacer_session=None): super().__init__(court_id, pacer_session) - def query(self, document_number): + async def query(self, document_number): """Query the "attachment page" endpoint and set the results to self.response. The appellate attachment page uses the a variable called dls_id @@ -48,7 +48,7 @@ def query(self, document_number): document_number = f"{document_number[:3]}0{document_number[4:]}" url = self.url + document_number logger.info("Querying the attachment page endpoint at URL: %s", url) - self.response = self.session.get(url) + self.response = await self.session.get(url) self.parse() def _strip_bad_html_tags_insecure(self, text: str) -> None: diff --git a/juriscraper/pacer/appellate_docket.py b/juriscraper/pacer/appellate_docket.py index ea3accb38..d2355c9bc 100644 --- a/juriscraper/pacer/appellate_docket.py +++ b/juriscraper/pacer/appellate_docket.py @@ -85,7 +85,7 @@ def url(self): "n/beam/servlet/TransportRoom" % self.court_id ) - def query( + async def query( self, docket_number, show_docket_entries=False, @@ -246,7 +246,7 @@ def query( docket_number, query_params, ) - self.response = self.session.get(self.url, params=query_params) + self.response = await self.session.get(self.url, params=query_params) self.parse() def parse(self): @@ -258,7 +258,7 @@ def parse(self): self._clear_caches() super().parse() - def download_pdf(self, pacer_doc_id, pacer_case_id=None): + async def download_pdf(self, pacer_doc_id, pacer_case_id=None): """Download a PDF from an appellate court. :param pacer_case_id: The case ID for the docket @@ -321,7 +321,7 @@ def download_pdf(self, pacer_doc_id, pacer_case_id=None): logger.info( "GETting PDF at URL: %s with params: %s", self.url, query_params ) - r = self.session.get(self.url, params=query_params) + r = await self.session.get(self.url, params=query_params) r.raise_for_status() if is_pdf(r): logger.info( diff --git a/juriscraper/pacer/attachment_page.py b/juriscraper/pacer/attachment_page.py index 36b0385e1..f26eb3d04 100644 --- a/juriscraper/pacer/attachment_page.py +++ b/juriscraper/pacer/attachment_page.py @@ -25,7 +25,7 @@ def __init__(self, court_id, pacer_session=None): # document number, only the attachment numbers. self.is_bankruptcy = self.court_id.endswith("b") - def query(self, document_number): + async def query(self, document_number): """Query the "attachment page" endpoint and set the results to self.response. :param document_number: The internal PACER document ID for the item. @@ -39,7 +39,7 @@ def query(self, document_number): document_number = f"{document_number[:3]}0{document_number[4:]}" url = self.url + document_number logger.info("Querying the attachment page endpoint at URL: %s", url) - self.response = self.session.get(url) + self.response = await self.session.get(url) self.parse() @property diff --git a/juriscraper/pacer/case_query.py b/juriscraper/pacer/case_query.py index f463e79da..49b548c38 100644 --- a/juriscraper/pacer/case_query.py +++ b/juriscraper/pacer/case_query.py @@ -222,7 +222,7 @@ def metadata(self): self._metadata = data return data - def query(self, pacer_case_id): + async def query(self, pacer_case_id): """Use a district court's PACER query function with a known case ID At the top of every district PACER court, there's a button that says, @@ -292,7 +292,9 @@ def query(self, pacer_case_id): pacer_case_id, self.court_id, ) - self.response = self.session.post(f"{self.url}?1-L_1_0-1", data=params) + self.response = await self.session.post( + f"{self.url}?1-L_1_0-1", data=params + ) self.parse() @property diff --git a/juriscraper/pacer/case_query_advanced.py b/juriscraper/pacer/case_query_advanced.py index 91854bff0..a5b1acbdf 100644 --- a/juriscraper/pacer/case_query_advanced.py +++ b/juriscraper/pacer/case_query_advanced.py @@ -143,7 +143,7 @@ def metadata(self): self._metadata = data return data - def query( + async def query( self, name_last="", name_first="", @@ -234,7 +234,9 @@ def query( } ) logger.info("Running advanced case query with params '%s'", params) - self.response = self.session.post(f"{self.url}?1-L_1_0-1", data=params) + self.response = await self.session.post( + f"{self.url}?1-L_1_0-1", data=params + ) self.parse() diff --git a/juriscraper/pacer/claims_activity.py b/juriscraper/pacer/claims_activity.py index 2957e833f..79698159b 100644 --- a/juriscraper/pacer/claims_activity.py +++ b/juriscraper/pacer/claims_activity.py @@ -367,7 +367,7 @@ def _get_label_value_pair_from_string( data = {label: force_unicode(value)} return data - def query( + async def query( self, pacer_case_id: str, docket_number: str, @@ -429,7 +429,7 @@ def query( else: post_param = "1-L_1_0-1" - self.response = self.session.post( + self.response = await self.session.post( f"{self.url}?{post_param}", data=params ) self.parse() diff --git a/juriscraper/pacer/claims_register.py b/juriscraper/pacer/claims_register.py index 36f2ef1ac..24d616ffe 100644 --- a/juriscraper/pacer/claims_register.py +++ b/juriscraper/pacer/claims_register.py @@ -340,7 +340,7 @@ def _parse_history_cell(self, td): history_rows.append(row) return history_rows - def query( + async def query( self, pacer_case_id, docket_number, date_start=None, date_end=None ): """Query the claims register and return the results. @@ -388,5 +388,7 @@ def query( self.court_id, params, ) - self.response = self.session.post(f"{self.url}?1-L_1_0-1", data=params) + self.response = await self.session.post( + f"{self.url}?1-L_1_0-1", data=params + ) self.parse() diff --git a/juriscraper/pacer/docket_history_report.py b/juriscraper/pacer/docket_history_report.py index 675652c24..be4de52b4 100644 --- a/juriscraper/pacer/docket_history_report.py +++ b/juriscraper/pacer/docket_history_report.py @@ -83,7 +83,7 @@ def metadata(self): self._metadata = data return data - def query( + async def query( self, pacer_case_id, query_type="History", @@ -122,7 +122,7 @@ def query( "Getting nonce for docket history report with " "pacer_case_id: %s" % pacer_case_id ) - r = self.session.get(f"{self.url}?{pacer_case_id}") + r = await self.session.get(f"{self.url}?{pacer_case_id}") nonce = get_nonce_from_form(r) query_params = { @@ -137,7 +137,7 @@ def query( "params %s and nonce %s" % (pacer_case_id, query_params, nonce) ) - self.response = self.session.post( + self.response = await self.session.post( f"{self.url}?{nonce}", data=query_params ) self.parse() diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index d0e4a2429..0af7e62d3 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -1264,7 +1264,7 @@ def is_adversary_proceeding(self): self._is_adversary_proceeding = adversary_proceeding return adversary_proceeding - def query( + async def query( self, pacer_case_id, date_range_type="Filed", @@ -1377,7 +1377,7 @@ def query( % (pacer_case_id, query_params) ) - self.response = self.session.post( + self.response = await self.session.post( f"{self.url}?1-L_1_0-1", data=query_params ) self.parse() diff --git a/juriscraper/pacer/download_confirmation_page.py b/juriscraper/pacer/download_confirmation_page.py index d012e26a8..c496e09b2 100644 --- a/juriscraper/pacer/download_confirmation_page.py +++ b/juriscraper/pacer/download_confirmation_page.py @@ -25,7 +25,7 @@ def __init__(self, court_id, pacer_session=None): ]: self.is_appellate = True - def query(self, pacer_doc_id): + async def query(self, pacer_doc_id): """Query the "confirmation download page" endpoint and set the results to self.response. @@ -45,7 +45,7 @@ def query(self, pacer_doc_id): url = make_doc1_url(self.court_id, pacer_doc_id, True) logger.info("Querying the confirmation page endpoint at URL: %s", url) - self.response = self.session.get(url) + self.response = await self.session.get(url) if is_pdf(self.response): # Sometimes the PDF document is returned without showing the # download confirmation page, not a valid page to parse. diff --git a/juriscraper/pacer/free_documents.py b/juriscraper/pacer/free_documents.py index 89befd99c..8a5e62aa5 100644 --- a/juriscraper/pacer/free_documents.py +++ b/juriscraper/pacer/free_documents.py @@ -42,7 +42,7 @@ def url(self): f"https://ecf.{self.court_id}.uscourts.gov/cgi-bin/WrtOpRpt.pl" ) - def query(self, start, end, sort="date_filed", day_span=7): + async def query(self, start, end, sort="date_filed", day_span=7): """Query the Free Opinions report one day at a time. :param start: a date object representing the date you want to start at. @@ -76,7 +76,7 @@ def query(self, start, end, sort="date_filed", day_span=7): ) # Get the first page, grab the nonce, and submit using that. - response = self.session.get(self.url) + response = await self.session.get(self.url) nonce = get_nonce_from_form(response) logger.info("Got nonce of %s", nonce) @@ -87,7 +87,9 @@ def query(self, start, end, sort="date_filed", day_span=7): "Key1": self._normalize_sort_param(sort), "all_case_ids": "0", } - response = self.session.post(f"{self.url}?{nonce}", data=data) + response = await self.session.post( + f"{self.url}?{nonce}", data=data + ) responses.append(response) self.responses = responses @@ -105,7 +107,9 @@ def parse(self): set_response_encoding(response) text = clean_html(response.text) tree = get_html_parsed_text(text) - tree.rewrite_links(fix_links_in_lxml_tree, base_href=response.url) + tree.rewrite_links( + fix_links_in_lxml_tree, base_href=str(response.url) + ) self.trees.append(tree) @property diff --git a/juriscraper/pacer/hidden_api.py b/juriscraper/pacer/hidden_api.py index 40a3b20e5..c9bda13e8 100644 --- a/juriscraper/pacer/hidden_api.py +++ b/juriscraper/pacer/hidden_api.py @@ -23,7 +23,7 @@ class PossibleCaseNumberApi(BaseReport): PATH = "cgi-bin/possible_case_numbers.pl" - def query(self, docket_number): + async def query(self, docket_number): """Query the "possible case numbers" endpoint and return the results. :param docket_number: A string representing a docket number @@ -36,7 +36,7 @@ def query(self, docket_number): logger.info( f"Querying the possible case number endpoint at URL: {url}" ) - self.response = self.session.get(url) + self.response = await self.session.get(url) self.parse() def _parse_text(self, text): @@ -203,7 +203,9 @@ def __init__(self, court_id, pacer_session=None): ), "This API is not available at bankruptcy courts." super().__init__(court_id, pacer_session) - def query(self, pacer_case_id, document_number, attachment_number=""): + async def query( + self, pacer_case_id, document_number, attachment_number="" + ): """Query the show_case_doc endpoint and return the normalized doc1 number. @@ -228,7 +230,7 @@ def query(self, pacer_case_id, document_number, attachment_number=""): ) logger.info(f"Querying the show_doc_url endpoint with URL: {url}") # Only do a head request, else we get text content we don't need. - self.response = self.session.head(url, allow_redirects=True) + self.response = await self.session.head(url, follow_redirects=True) self.parse() def _parse_text(self, text): @@ -238,7 +240,7 @@ def _parse_text(self, text): @property def data(self): """Get the URL out of the response object.""" - url = self.response.url + url = str(self.response.url) if "doc1" in url: return get_pacer_doc_id_from_doc1_url(url) else: diff --git a/juriscraper/pacer/http.py b/juriscraper/pacer/http.py index 0f4bc58e9..93f15583a 100644 --- a/juriscraper/pacer/http.py +++ b/juriscraper/pacer/http.py @@ -1,8 +1,8 @@ import json import re +from urllib.parse import unquote -import requests -from requests.packages.urllib3 import exceptions +import httpx from ..lib.exceptions import PacerLoginException from ..lib.html_utils import get_html_parsed_text, get_xml_parsed_text @@ -11,8 +11,6 @@ logger = make_default_logger() -requests.packages.urllib3.disable_warnings(exceptions.InsecureRequestWarning) - def check_if_logged_in_page(content: bytes) -> bool: """Is this a valid HTML page from PACER? @@ -87,9 +85,15 @@ def check_if_logged_in_page(content: bytes) -> bool: return False -class PacerSession(requests.Session): +class UNQUOTED_URL(httpx.URL): + @property + def raw_path(self) -> bytes: + return unquote(super().raw_path.decode("ascii")).encode("ascii") + + +class PacerSession(httpx.AsyncClient): """ - Extension of requests.Session to handle PACER oddities making it easier + Extension of httpx.AsyncClient to handle PACER oddities making it easier for folks to just POST data to PACER endpoints/apis. Also includes utilities for logging into PACER and re-logging in when @@ -99,17 +103,26 @@ class PacerSession(requests.Session): LOGIN_URL = "https://pacer.login.uscourts.gov/services/cso-auth" def __init__( - self, cookies=None, username=None, password=None, client_code=None + self, + cookies=None, + username=None, + password=None, + client_code=None, + user_agent="Juriscraper", + **kwargs, ): """ Instantiate a new PACER API Session with some Juriscraper defaults - :param cookies: an optional RequestsCookieJar object with cookies for the session + :param cookies: an optional httpx.Cookies object with cookies for the session :param username: a PACER account username :param password: a PACER account password :param client_code: an optional PACER client code for the session """ - super().__init__() - self.headers["User-Agent"] = "Juriscraper" + kwargs.setdefault("http2", True) + kwargs.setdefault("follow_redirects", True) + super().__init__(**kwargs) + self.user_agent = user_agent + self.headers["User-Agent"] = self.user_agent self.headers["Referer"] = "https://external" # For CVE-001-FLP. self.verify = False @@ -125,16 +138,47 @@ def __init__( self.client_code = client_code self.additional_request_done = False - def get(self, url, auto_login=True, **kwargs): - """Overrides request.Session.get with session retry logic. + async def get(self, url, auto_login=True, **kwargs): + """Overrides httpx.AsyncClient.get with session retry logic. + + :param url: url string to GET + :param auto_login: Whether the auto-login procedure should happen. + :return: httpx.Response + """ + kwargs.setdefault("timeout", 300) + + r = await super().get(url, **kwargs) + + if b"This user has no access privileges defined." in r.content: + # This is a strange error that we began seeing in CM/ECF 6.3.1 at + # ILND. You can currently reproduce it by logging in on the central + # login page, selecting "Court Links" as your destination, and then + # loading: https://ecf.ilnd.uscourts.gov/cgi-bin/WrtOpRpt.pl + # The solution when this error shows up is to simply re-run the get + # request, so that's what we do here. PACER needs some frustrating + # and inelegant hacks sometimes. + r = await super().get(url, **kwargs) + if auto_login: + updated = await self._login_again(r) + if updated: + # Re-do the request with the new session. + r = await super().get(url, **kwargs) + # Do an additional check of the content returned. + await self._login_again(r) + return r + + async def get_unquoted(self, url, auto_login=True, **kwargs): + """Overrides httpx.AsyncClient.get with session retry logic. :param url: url string to GET :param auto_login: Whether the auto-login procedure should happen. - :return: requests.Response + :return: httpx.Response """ kwargs.setdefault("timeout", 300) - r = super().get(url, **kwargs) + request = self.build_request("GET", url=url, **kwargs) + request.url = UNQUOTED_URL(url) + r = await super().send(request) if b"This user has no access privileges defined." in r.content: # This is a strange error that we began seeing in CM/ECF 6.3.1 at @@ -144,19 +188,21 @@ def get(self, url, auto_login=True, **kwargs): # The solution when this error shows up is to simply re-run the get # request, so that's what we do here. PACER needs some frustrating # and inelegant hacks sometimes. - r = super().get(url, **kwargs) + r = await super().get(url, **kwargs) if auto_login: - updated = self._login_again(r) + updated = await self._login_again(r) if updated: # Re-do the request with the new session. - r = super().get(url, **kwargs) + request = self.build_request("GET", url=url, **kwargs) + request.url = UNQUOTED_URL(url) + r = await super().send(request) # Do an additional check of the content returned. - self._login_again(r) + await self._login_again(r) return r - def post(self, url, data=None, json=None, auto_login=True, **kwargs): + async def post(self, url, data=None, json=None, auto_login=True, **kwargs): """ - Overrides requests.Session.post with PACER-specific fun. + Overrides httpx.AsyncClient.post with PACER-specific fun. Will automatically convert data dict into proper multi-part form data and pass to the files parameter instead. @@ -169,7 +215,7 @@ def post(self, url, data=None, json=None, auto_login=True, **kwargs): :param json: json object to post :param auto_login: Whether the auto-login procedure should happen. :param kwargs: assorted keyword arguments - :return: requests.Response + :return: httpx.Response """ kwargs.setdefault("timeout", 300) @@ -179,24 +225,24 @@ def post(self, url, data=None, json=None, auto_login=True, **kwargs): else: kwargs.update({"data": data, "json": json}) - r = super().post(url, **kwargs) + r = await super().post(url, **kwargs) if auto_login: - updated = self._login_again(r) + updated = await self._login_again(r) if updated: # Re-do the request with the new session. - return super().post(url, **kwargs) + return await super().post(url, **kwargs) return r - def head(self, url, **kwargs): + async def head(self, url, **kwargs): """ - Overrides request.Session.head with a default timeout parameter. + Overrides httpx.AsyncClient.head with a default timeout parameter. :param url: url string upon which to do a HEAD request :param kwargs: assorted keyword arguments - :return: requests.Response + :return: httpx.Response """ kwargs.setdefault("timeout", 300) - return super().head(url, **kwargs) + return await super().head(url, **kwargs) @staticmethod def _prepare_multipart_form_data(data): @@ -226,7 +272,7 @@ def _get_view_state(r): id="j_id1:javax.faces.ViewState:0" value="some-long-value-here"> - :param r: A request.Response object + :param r: A httpx.Response object :return The value of the "value" attribute of the ViewState input element. """ @@ -262,7 +308,7 @@ def _get_xml_view_state(r): xpath = "//update[@id='j_id1:javax.faces.ViewState:0']/text()" return tree.xpath(xpath)[0] - def login(self, url=None): + async def login(self, url=None): """Attempt to log into the PACER site. The first step is to get an authentication token using a PACER username and password. @@ -295,19 +341,18 @@ def login(self, url=None): if self.client_code: data["clientCode"] = self.client_code - login_post_r = super().post( + login_post_r = await super().post( url, headers={ - "User-Agent": "Juriscraper", - "Content-type": "application/json", + "User-Agent": self.user_agent, "Accept": "application/json", }, timeout=60, - data=json.dumps(data), + json=data, ) - if login_post_r.status_code != requests.codes.ok: - message = f"Unable connect to PACER site: '{login_post_r.status_code}: {login_post_r.reason}'" + if login_post_r.status_code != httpx.codes.OK: + message = f"Unable connect to PACER site: '{login_post_r.status_code}: {login_post_r.reason_phrase}'" logger.warning(message) raise PacerLoginException(message) @@ -332,8 +377,7 @@ def login(self, url=None): "Did not get NextGenCSO cookie when attempting PACER login." ) # Set up cookie with 'nextGenCSO' token (128-byte string of characters) - session_cookies = requests.cookies.RequestsCookieJar() - session_cookies.set( + self.cookies.set( "NextGenCSO", response_json.get("nextGenCSO"), domain=".uscourts.gov", @@ -341,7 +385,7 @@ def login(self, url=None): ) # Support "CurrentGen" servers as well. This can be remoevd if they're # ever all upgraded to NextGen. - session_cookies.set( + self.cookies.set( "PacerSession", response_json.get("nextGenCSO"), domain=".uscourts.gov", @@ -350,22 +394,21 @@ def login(self, url=None): # If optional client code information is included, # 'PacerClientCode' cookie should be set if self.client_code: - session_cookies.set( + self.cookies.set( "PacerClientCode", self.client_code, domain=".uscourts.gov", path="/", ) - self.cookies = session_cookies logger.info("New PACER session established.") - def _do_additional_request(self, r: requests.Response) -> bool: + def _do_additional_request(self, r: httpx.Response) -> bool: """Check if we should do an additional request to PACER, sometimes PACER returns the login page even though cookies are still valid. Do an additional GET request if we haven't done it previously. See https://github.com/freelawproject/courtlistener/issues/2160. - :param r: The requests Response object. + :param r: The httpx Response object. :return: True if an additional request should be done, otherwise False. """ if r.request.method == "GET" and self.additional_request_done is False: @@ -373,7 +416,7 @@ def _do_additional_request(self, r: requests.Response) -> bool: return True return False - def _login_again(self, r): + async def _login_again(self, r): """Log into PACER if the session has credentials and the session has expired. @@ -396,7 +439,7 @@ def _login_again(self, r): logger.info( "Invalid/expired PACER session. Establishing new session." ) - self.login() + await self.login() return True else: if self._do_additional_request(r): diff --git a/juriscraper/pacer/internet_archive.py b/juriscraper/pacer/internet_archive.py index c4edb26a6..ff16ce7fd 100644 --- a/juriscraper/pacer/internet_archive.py +++ b/juriscraper/pacer/internet_archive.py @@ -1,5 +1,5 @@ +from httpx import AsyncClient, Timeout from lxml import etree -from requests import Session from ..lib.judge_parsers import normalize_judge_string from ..lib.log_tools import make_default_logger @@ -21,7 +21,7 @@ class InternetArchive(BaseDocketReport): CACHE_ATTRS = ["metadata", "parties", "docket_entries"] - def __init__(self, court_id): + def __init__(self, court_id, **kwargs): super().__init__(court_id) # Initialize the empty cache properties. @@ -29,25 +29,27 @@ def __init__(self, court_id): self._metadata = None self._parties = None self._docket_entries = None - - self.session = Session() + kwargs.setdefault("http2", True) + self.session = AsyncClient(**kwargs) self.response = None self.tree = None self.parser = etree.XMLParser(recover=True) self.is_valid = True - def __del__(self): + async def __aexit__(self): if self.session: - self.session.close() + await self.session.aclose() - def download_pdf(self, pacer_case_id, document_number, attachment_number): + async def download_pdf( + self, pacer_case_id, document_number, attachment_number + ): """Download a PDF from the Internet Archive""" - timeout = (60, 300) + timeout = Timeout(60, read=300) url = get_pdf_url( self.court_id, pacer_case_id, document_number, attachment_number ) logger.info("GETting PDF at URL: %s") - r = self.session.get(url, timeout=timeout) + r = await self.session.get(url, timeout=timeout) r.raise_for_status() if not is_pdf(r): logger.error(f"Got non-PDF data, but expected a PDF at: {url}") @@ -55,12 +57,12 @@ def download_pdf(self, pacer_case_id, document_number, attachment_number): else: return r - def query(self, pacer_case_id): + async def query(self, pacer_case_id): """Download a docket XML page from the Internet Archive""" - timeout = (60, 300) + timeout = Timeout(60, read=300) url = get_docketxml_url(self.court_id, pacer_case_id) logger.info("GETting docket XML at URL: %s") - r = self.session.get(url, timeout=timeout) + r = await self.session.get(url, timeout=timeout) self.response = r self.parse() diff --git a/juriscraper/pacer/list_of_creditors.py b/juriscraper/pacer/list_of_creditors.py index af6fd3296..592b46059 100644 --- a/juriscraper/pacer/list_of_creditors.py +++ b/juriscraper/pacer/list_of_creditors.py @@ -1,7 +1,6 @@ from typing import Optional -import requests -from requests import Response +from httpx import Request, Response, Timeout from juriscraper.pacer.reports import BaseReport @@ -73,7 +72,7 @@ def _get_valid_post_param(self) -> str: param = action_value.split("?")[1] return param - def download_file(self) -> Optional[Response]: + async def download_file(self) -> Optional[Response]: """Downloads the formated pipe-limited file using the FORMAT_RAW_DATA_SERVICE API. @@ -86,15 +85,19 @@ def download_file(self) -> Optional[Response]: "useragentstring": "CM/ECF-BK V10.6.4", "data": self._metadata["data"], } - req_timeout = (60, 300) - r = requests.post( - self.FORMAT_RAW_DATA_SERVICE, params=params, timeout=req_timeout + req_timeout = Timeout(60, read=300) + request = Request( + "POST", + self.FORMAT_RAW_DATA_SERVICE, + params=params, + extensions=dict(timeout=req_timeout.as_dict()), ) + r = await self.session.send(request) if "text/plain" in r.headers.get("content-type"): return r return None - def query_post_param(self) -> str: + async def query_post_param(self) -> str: """To query the report and ensure that the cost is the same as in the browser, obtain a valid POST param 'x-L_1_0-1' from the input form. @@ -105,11 +108,11 @@ def query_post_param(self) -> str: ), "session attribute of ListOfCreditors cannot be None." logger.info(f"Getting a valid POST param for '{self.court_id}'") - self.response = self.session.get(self.url) + self.response = await self.session.get(self.url) self.parse() return self._get_valid_post_param() - def query( + async def query( self, pacer_case_id: str, docket_number: str, @@ -141,7 +144,7 @@ def query( self.court_id, params, ) - self.response = self.session.post( + self.response = await self.session.post( f"{self.url}?{post_param}", data=params ) self.parse() diff --git a/juriscraper/pacer/mobile_query.py b/juriscraper/pacer/mobile_query.py index 9a53a487b..226a56129 100644 --- a/juriscraper/pacer/mobile_query.py +++ b/juriscraper/pacer/mobile_query.py @@ -76,7 +76,7 @@ def metadata(self): self._metadata = data return data - def query(self, pacer_case_id): + async def query(self, pacer_case_id): """Use a district court's PACER mobile query function with a known case id At the top of every district PACER court, there's a button that says, @@ -119,7 +119,7 @@ def query(self, pacer_case_id): pacer_case_id, self.court_id, ) - self.response = self.session.post( + self.response = await self.session.post( f"{self.url}?search=caseInfo&caseid=={pacer_case_id}" ) self.parse() diff --git a/juriscraper/pacer/reports.py b/juriscraper/pacer/reports.py index fdba70e28..de5fbdb42 100644 --- a/juriscraper/pacer/reports.py +++ b/juriscraper/pacer/reports.py @@ -2,9 +2,8 @@ from typing import Optional, Tuple from urllib.parse import urljoin -import requests +from httpx import Request, Response, Timeout from lxml.html import HtmlElement -from requests import Response from ..lib.html_utils import ( clean_html, @@ -75,7 +74,7 @@ def query(self, *args, **kwargs): raise NotImplementedError(".query() must be overridden") def parse(self): - """Parse the data provided in a requests.response object and set + """Parse the data provided in a httpx.Response object and set self.tree to be an lxml etree. In most cases, you won't need to call this since it will be automatically called by self.query, if needed. @@ -134,9 +133,9 @@ def data(self): """Extract the data from the tree and return it.""" raise NotImplementedError(".data() must be overridden.") - def _query_pdf_download( + async def _query_pdf_download( self, - pacer_case_id: str, + pacer_case_id: Optional[str], pacer_doc_id: str, pacer_magic_num: Optional[str], got_receipt: str, @@ -181,15 +180,15 @@ def _query_pdf_download( if pacer_magic_num is not None: data["magic_num"] = pacer_magic_num - timeout = (60, 300) + timeout = Timeout(60, read=300) logger.info(f"POSTing URL: {url} with params: {data}") - r = self.session.post(url, data=data, timeout=timeout) + r = await self.session.post(url, data=data, timeout=timeout) return r, url - def download_pdf( + async def download_pdf( self, pacer_case_id: str, - pacer_doc_id: int, + pacer_doc_id: str, pacer_magic_num: Optional[str] = None, appellate: bool = False, ) -> Tuple[Optional[Response], str]: @@ -201,6 +200,7 @@ def download_pdf( one can be found (is not sealed, gone, etc.). And a string indicating the error message, if there is one or else an empty string. """ + req_timeout = Timeout(60, read=300) if pacer_magic_num: # If magic_number is available try to download the # document anonymously by its magic link @@ -223,8 +223,13 @@ def download_pdf( } # Add parameters to the PACER base url and make a GET request - req_timeout = (60, 300) - r = requests.get(url, params=params, timeout=req_timeout) + request = Request( + "GET", + url, + params=params, + extensions=dict(timeout=req_timeout.as_dict()), + ) + r = await self.session.send(request) # If the response is an HTML document, and it doesn't contain an # IFRAME, the magic link document is no longer available @@ -241,7 +246,7 @@ def download_pdf( else: # If no magic_number use normal method to fetch the document - r, url = self._query_pdf_download( + r, url = await self._query_pdf_download( pacer_case_id, pacer_doc_id, pacer_magic_num, got_receipt="1" ) @@ -251,7 +256,7 @@ def download_pdf( # this docket. Probably a criminal case with the doppelganger # bug. Try again, but do so without the pacer_case_id. # This should work, but will omit the blue header on the PDFs. - r, url = self._query_pdf_download( + r, url = await self._query_pdf_download( None, pacer_doc_id, pacer_magic_num, got_receipt="1" ) @@ -323,7 +328,7 @@ def download_pdf( m = redirect_re.search(r.content) if m is not None: redirect_url = m.group(1).decode("utf-8") - r = self.session.get(urljoin(url, redirect_url)) + r = await self.session.get(urljoin(url, redirect_url)) r.raise_for_status() # The request above sometimes generates an HTML page with an iframe @@ -337,7 +342,7 @@ def download_pdf( text = clean_html(r.text) tree = get_html_parsed_text(text) - tree.rewrite_links(fix_links_in_lxml_tree, base_href=r.url) + tree.rewrite_links(fix_links_in_lxml_tree, base_href=str(r.url)) try: iframe_src = tree.xpath("//iframe/@src")[0] except IndexError: @@ -360,10 +365,15 @@ def download_pdf( if pacer_magic_num: # If magic_number is available try to download the # document anonymously from iframe_src - r = requests.get(iframe_src, timeout=req_timeout) + request = Request( + "GET", + iframe_src, + extensions=dict(timeout=req_timeout.as_dict()), + ) + r = await self.session.send(request) else: # Use PACER session to fetch the document from iframe_src - r = self.session.get(iframe_src) + r = await self.session.get_unquoted(iframe_src) if is_pdf(r): logger.info( f"Got iframed PDF data for case {url} at: {iframe_src}" @@ -371,11 +381,13 @@ def download_pdf( return r, "" - def is_pdf_sealed(self, pacer_case_id, pacer_doc_id, pacer_magic_num=None): + async def is_pdf_sealed( + self, pacer_case_id, pacer_doc_id, pacer_magic_num=None + ): """Check if a PDF is sealed without trying to actually download it. """ - r, url = self._query_pdf_download( + r, url = await self._query_pdf_download( pacer_case_id, pacer_doc_id, pacer_magic_num, got_receipt="0" ) sealed = "You do not have permission to view this document." diff --git a/juriscraper/pacer/rss_feeds.py b/juriscraper/pacer/rss_feeds.py index d01e5259e..f808eb0a5 100644 --- a/juriscraper/pacer/rss_feeds.py +++ b/juriscraper/pacer/rss_feeds.py @@ -6,7 +6,7 @@ from html import unescape import feedparser -from requests import Session +from httpx import AsyncClient, Timeout from ..lib.log_tools import make_default_logger from ..lib.string_utils import clean_string, harmonize @@ -131,11 +131,12 @@ class PacerRssFeed(DocketReport): CACHE_ATTRS = ["data"] - def __init__(self, court_id): + def __init__(self, court_id, **kwargs): super().__init__(court_id) self._clear_caches() self._data = None - self.session = Session() + kwargs.setdefault("http2", True) + self.session = AsyncClient(**kwargs) self.is_valid = True self.is_appellate = False if self.court_id[-1].isdigit() or self.court_id in [ @@ -149,9 +150,9 @@ def __init__(self, court_id): else: self.is_bankruptcy = False - def __del__(self): + async def __aexit__(self): if self.session: - self.session.close() + await self.session.aclose() @property def url(self): @@ -173,10 +174,10 @@ def url(self): else: return f"https://ecf.{self.court_id}.uscourts.gov/{self.PATH}" - def query(self): + async def query(self): """Query the RSS feed for a given court ID - Note that we use requests here, and so we forgo some of the + Note that we use httpx here, and so we forgo some of the useful features that feedparser offers around the Etags and Last-Modified headers. This is fine for now because no PACER site seems to support these headers, but eventually we'll @@ -194,8 +195,8 @@ def query(self): # outages cause us grief. Too short and slow courts don't get done. # Previously, this value has been (60, 300), then 5. Hopefully the # below is a reasonable middle ground. - timeout = (5, 20) - self.response = self.session.get(self.url, timeout=timeout) + timeout = Timeout(5, read=20) + self.response = await self.session.get(self.url, timeout=timeout) def parse(self): self._clear_caches() diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 09cc7e522..d9d9549d3 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -2,7 +2,7 @@ from datetime import date, datetime from typing import Dict, Optional, Union -import requests +import httpx import tldextract from dateutil import parser from dateutil.tz import gettz @@ -430,9 +430,13 @@ def get_doc_id_prefix_from_court_id(court_id): return cid_to_prefix_map[court_id] -def get_pacer_court_info(): - r = requests.get("https://court-version-scraper.fly.dev/courts.json") - return r.json() +async def get_pacer_court_info(**kwargs): + kwargs.setdefault("http2", True) + async with httpx.AsyncClient(**kwargs) as client: + r = await client.get( + "https://court-version-scraper.fly.dev/courts.json" + ) + return r.json() def get_courts_from_json(j): diff --git a/requirements.txt b/requirements.txt index 2fd891519..8e5e0e91d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,6 @@ geonamescache==1.6.0 html5lib lxml~=4.9 python-dateutil==2.8.2 -requests>=2.20.0 +httpx[http2]>=0.25.0 selenium>=4.9.1 tldextract diff --git a/tests/local/test_PacerNeedLoginTest.py b/tests/local/test_PacerNeedLoginTest.py index 5891e26a3..ab5fbe1f8 100644 --- a/tests/local/test_PacerNeedLoginTest.py +++ b/tests/local/test_PacerNeedLoginTest.py @@ -9,7 +9,7 @@ import unittest from unittest import mock -from requests import Request +from httpx import Request from juriscraper.lib.exceptions import PacerLoginException from juriscraper.lib.test_utils import MockResponse, warn_or_crash_slow_parser @@ -17,7 +17,7 @@ from tests import TESTS_ROOT_EXAMPLES_PACER -class PacerNeedLoginTest(unittest.TestCase): +class PacerNeedLoginTest(unittest.IsolatedAsyncioTestCase): """Test if different pages require a log in.""" def parse_files(self, path_root, file_ext): @@ -60,19 +60,19 @@ def test_parsing_auth_samples(self): self.parse_files(path_root, "*.html") @mock.patch( - "juriscraper.pacer.http.requests.Session.get", + "juriscraper.pacer.http.httpx.AsyncClient.get", side_effect=lambda *args, **kwargs: MockResponse( 200, b"", headers={"content-type": "text/html"}, - request=Request(method="GET"), + request=Request(method="GET", url="https://example.com"), ), ) @mock.patch( "juriscraper.pacer.http.check_if_logged_in_page", side_effect=lambda x: False, ) - def test_do_an_additional_get_request( + async def test_do_an_additional_get_request( self, mock_get, mock_check_if_logged_in ): """Test if we can do an additional GET request after check_if_logged_in @@ -81,24 +81,24 @@ def test_do_an_additional_get_request( """ session = PacerSession(username="", password="") with self.assertRaises(PacerLoginException): - session.get("https://example.com") + await session.get("https://example.com") self.assertEqual(mock_check_if_logged_in.call_count, 2) @mock.patch( - "juriscraper.pacer.http.requests.Session.post", + "juriscraper.pacer.http.httpx.AsyncClient.post", side_effect=lambda *args, **kwargs: MockResponse( 200, b"", headers={"content-type": "text/html"}, - request=Request(method="POST"), + request=Request(method="POST", url="https://example.com"), ), ) @mock.patch( "juriscraper.pacer.http.check_if_logged_in_page", side_effect=lambda x: False, ) - def test_avoid_an_additional_post_request( + async def test_avoid_an_additional_post_request( self, mock_get, mock_check_if_logged_in ): """Test if we can avoid an additional POST request after @@ -107,10 +107,10 @@ def test_avoid_an_additional_post_request( """ session = PacerSession(username="", password="") with self.assertRaises(PacerLoginException): - session.post("https://example.com") + await session.post("https://example.com") self.assertEqual(mock_check_if_logged_in.call_count, 1) - @mock.patch("juriscraper.pacer.http.requests.Session.get") + @mock.patch("juriscraper.pacer.http.httpx.AsyncClient.get") @mock.patch( "juriscraper.pacer.http.check_if_logged_in_page", side_effect=lambda x: False, @@ -119,12 +119,12 @@ def test_avoid_an_additional_post_request( "juriscraper.pacer.http.is_pdf", side_effect=lambda x: True, ) - def test_avoid_an_additional_get_request_pdf( + async def test_avoid_an_additional_get_request_pdf( self, mock_get, mock_check_if_logged_in, mock_is_pdf ): """Test if we can avoid an additional GET requests if a PDF binary is returned, check_if_logged_in_page shouldn't be called. """ session = PacerSession(username="", password="") - session.get("https://example.com") + await session.get("https://example.com") self.assertEqual(mock_check_if_logged_in.call_count, 0) diff --git a/tests/local/test_PacerSessionTest.py b/tests/local/test_PacerSessionTest.py index 9fa3b10bb..786a19649 100644 --- a/tests/local/test_PacerSessionTest.py +++ b/tests/local/test_PacerSessionTest.py @@ -5,7 +5,7 @@ from tests.network import get_pacer_session -class PacerSessionTest(unittest.TestCase): +class PacerSessionTest(unittest.IsolatedAsyncioTestCase): """Test the PacerSession wrapper class""" def setUp(self): @@ -20,15 +20,17 @@ def test_data_transformation(self): output = self.session._prepare_multipart_form_data(data) self.assertEqual(output, expected) - @mock.patch("juriscraper.pacer.http.requests.Session.post") - def test_ignores_non_data_posts(self, mock_post): + @mock.patch("juriscraper.pacer.http.httpx.AsyncClient.post") + async def test_ignores_non_data_posts(self, mock_post): """Test that POSTs without a data parameter just pass through as normal. :param mock_post: mocked Session.post method """ data = {"name": ("filename", "junk")} - self.session.post("https://free.law", files=data, auto_login=False) + await self.session.post( + "https://free.law", files=data, auto_login=False + ) self.assertTrue( mock_post.called, "request.Session.post should be called" @@ -39,8 +41,8 @@ def test_ignores_non_data_posts(self, mock_post): "the data should not be changed if using a files call", ) - @mock.patch("juriscraper.pacer.http.requests.Session.post") - def test_transforms_data_on_post(self, mock_post): + @mock.patch("juriscraper.pacer.http.httpx.AsyncClient.post") + async def test_transforms_data_on_post(self, mock_post): """Test that POSTs using the data parameter get transformed into PACER's delightfully odd multi-part form data. @@ -49,7 +51,9 @@ def test_transforms_data_on_post(self, mock_post): data = {"name": "dave", "age": 33} expected = {"name": (None, "dave"), "age": (None, 33)} - self.session.post("https://free.law", data=data, auto_login=False) + await self.session.post( + "https://free.law", data=data, auto_login=False + ) self.assertTrue( mock_post.called, "request.Session.post should be called" @@ -65,9 +69,9 @@ def test_transforms_data_on_post(self, mock_post): "we should transform and populate the files argument", ) - @mock.patch("juriscraper.pacer.http.requests.Session.post") - def test_sets_default_timeout(self, mock_post): - self.session.post("https://free.law", data={}, auto_login=False) + @mock.patch("juriscraper.pacer.http.httpx.AsyncClient.post") + async def test_sets_default_timeout(self, mock_post): + await self.session.post("https://free.law", data={}, auto_login=False) self.assertTrue( mock_post.called, "request.Session.post should be called" diff --git a/tests/network/test_PacerAuthTest.py b/tests/network/test_PacerAuthTest.py index 233497b4e..d0714f112 100644 --- a/tests/network/test_PacerAuthTest.py +++ b/tests/network/test_PacerAuthTest.py @@ -7,14 +7,14 @@ from tests.network import SKIP_IF_NO_PACER_LOGIN, get_pacer_session -class PacerAuthTest(unittest.TestCase): +class PacerAuthTest(unittest.IsolatedAsyncioTestCase): """Test the authentication methods""" @SKIP_IF_NO_PACER_LOGIN - def test_logging_into_pacer(self): + async def test_logging_into_pacer(self): try: self.session = get_pacer_session() - self.session.login() + await self.session.login() self.assertIsNotNone(self.session) self.assertIsNotNone( self.session.cookies.get( @@ -24,8 +24,8 @@ def test_logging_into_pacer(self): except PacerLoginException: self.fail("Could not log into PACER") - def test_logging_in_bad_credentials(self): + async def test_logging_in_bad_credentials(self): """Make sure if username/password is incorrect an exception is throw""" session = PacerSession(username="foofoo", password="barbarbar") with self.assertRaises(PacerLoginException): - session.login() + await session.login() diff --git a/tests/network/test_PacerCaseQueryTest.py b/tests/network/test_PacerCaseQueryTest.py index ab605e5da..7a2b660bf 100644 --- a/tests/network/test_PacerCaseQueryTest.py +++ b/tests/network/test_PacerCaseQueryTest.py @@ -9,19 +9,19 @@ from tests.network import SKIP_IF_NO_PACER_LOGIN, get_pacer_session -class PacerCaseQueryTest(unittest.TestCase): +class PacerCaseQueryTest(unittest.IsolatedAsyncioTestCase): """A test of basic info for the Case Query""" - def setUp(self): + async def asyncSetUp(self): self.session = get_pacer_session() - self.session.login() + await self.session.login() self.report = CaseQuery("cand", self.session) self.pacer_case_id = "186730" # 4:06-cv-07294 Foley v. Bates @SKIP_IF_NO_PACER_LOGIN - def test_query(self): + async def test_query(self): """Can we get the basic info?""" - self.report.query(self.pacer_case_id) + await self.report.query(self.pacer_case_id) self.assertIn( "Foley v. Bates", self.report.response.text, diff --git a/tests/network/test_PacerDocketReportTest.py b/tests/network/test_PacerDocketReportTest.py index 6e004d6d3..5b40e0a9d 100644 --- a/tests/network/test_PacerDocketReportTest.py +++ b/tests/network/test_PacerDocketReportTest.py @@ -9,12 +9,12 @@ from tests.network import SKIP_IF_NO_PACER_LOGIN, get_pacer_session -class PacerDocketReportTest(unittest.TestCase): +class PacerDocketReportTest(unittest.IsolatedAsyncioTestCase): """A variety of tests for the docket report""" - def setUp(self): + async def asyncSetUp(self): self.session = get_pacer_session() - self.session.login() + await self.session.login() self.report = DocketReport("cand", self.session) self.pacer_case_id = "186730" # 4:06-cv-07294 Foley v. Bates @@ -29,16 +29,18 @@ def _count_rows(html): return len(tree.xpath("//table[./tr/td[3]]/tr")) - 1 # No header row @SKIP_IF_NO_PACER_LOGIN - def test_queries(self): + async def test_queries(self): """Do a variety of queries work?""" - self.report.query(self.pacer_case_id) + await self.report.query(self.pacer_case_id) self.assertIn( "Foley v. Bates", self.report.response.text, msg="Super basic query failed", ) - self.report.query(self.pacer_case_id, date_start=date(2007, 11, 1)) + await self.report.query( + self.pacer_case_id, date_start=date(2007, 11, 1) + ) row_count = self._count_rows(self.report.response.text) self.assertEqual( 2, @@ -48,7 +50,7 @@ def test_queries(self): "date. Got %s." % row_count, ) - self.report.query( + await self.report.query( self.pacer_case_id, date_start=date(2007, 11, 1), date_end=date(2007, 11, 28), @@ -62,7 +64,9 @@ def test_queries(self): "end dates. Got %s." % row_count, ) - self.report.query(self.pacer_case_id, doc_num_start=5, doc_num_end=5) + await self.report.query( + self.pacer_case_id, doc_num_start=5, doc_num_end=5 + ) row_count = self._count_rows(self.report.response.text) self.assertEqual( 1, @@ -72,7 +76,7 @@ def test_queries(self): "%s" % row_count, ) - self.report.query( + await self.report.query( self.pacer_case_id, date_start=date(2007, 11, 1), date_end=date(2007, 11, 28), @@ -88,7 +92,7 @@ def test_queries(self): "Entered. Got %s" % row_count, ) - self.report.query( + await self.report.query( self.pacer_case_id, doc_num_start=500, show_parties_and_counsel=True, @@ -98,7 +102,7 @@ def test_queries(self): self.report.response.text, msg="Didn't find party info when it was explicitly " "requested.", ) - self.report.query( + await self.report.query( self.pacer_case_id, doc_num_start=500, show_parties_and_counsel=False, @@ -110,18 +114,18 @@ def test_queries(self): ) @SKIP_IF_NO_PACER_LOGIN - def test_using_same_report_twice(self): + async def test_using_same_report_twice(self): """Do the caches get properly nuked between runs? See issue #187. """ # Query the first one... - self.report.query(self.pacer_case_id) + await self.report.query(self.pacer_case_id) d = self.report.data.copy() # Then the second one... second_pacer_case_id = "63111" # 1:07-cv-00035-RJA-HKS Anson v. USA - self.report.query(second_pacer_case_id) + await self.report.query(second_pacer_case_id) d2 = self.report.data.copy() self.assertNotEqual( d, diff --git a/tests/network/test_PacerFreeOpinionsTest.py b/tests/network/test_PacerFreeOpinionsTest.py index d5d6f0224..dddf6532a 100644 --- a/tests/network/test_PacerFreeOpinionsTest.py +++ b/tests/network/test_PacerFreeOpinionsTest.py @@ -22,16 +22,16 @@ ) -class PacerFreeOpinionsTest(unittest.TestCase): +class PacerFreeOpinionsTest(unittest.IsolatedAsyncioTestCase): """A variety of tests relating to the Free Written Opinions report""" - def setUp(self): + async def asyncSetUp(self): pacer_session = PacerSession() if pacer_credentials_are_defined(): # CAND chosen at random pacer_session = get_pacer_session() - pacer_session.login() + await pacer_session.login() with open(os.path.join(JURISCRAPER_ROOT, "pacer/courts.json")) as j: self.courts = get_courts_from_json(json.load(j)) @@ -112,59 +112,59 @@ def test_extract_written_documents_report(self): self.assertEqual(r.headers["Content-Type"], "application/pdf") @SKIP_IF_NO_PACER_LOGIN - def test_download_simple_pdf(self): + async def test_download_simple_pdf(self): """Can we download a PDF document returned directly?""" report = self.reports["alnb"] - r, msg = report.download_pdf("602431", "018129511556") + r, msg = await report.download_pdf("602431", "018129511556") self.assertEqual(r.headers["Content-Type"], "application/pdf") @SKIP_IF_NO_PACER_LOGIN - def test_download_redirected_pdf(self): + async def test_download_redirected_pdf(self): """Can we download a PDF document returned after a redirection?""" report = self.reports["azd"] - r, msg = report.download_pdf("1311031", "025125636132") + r, msg = await report.download_pdf("1311031", "025125636132") self.assertEqual(r.headers["Content-Type"], "application/pdf") @SKIP_IF_NO_PACER_LOGIN - def test_download_iframed_pdf(self): + async def test_download_iframed_pdf(self): """Can we download a PDF document returned in IFrame?""" report = self.reports["vib"] - r, msg = report.download_pdf("1507", "1921141093") + r, msg = await report.download_pdf("1507", "1921141093") self.assertEqual(r.headers["Content-Type"], "application/pdf") @SKIP_IF_NO_PACER_LOGIN - def test_download_unavailable_pdf(self): + async def test_download_unavailable_pdf(self): """Do we throw an error if the item is unavailable?""" # 5:11-cr-40057-JAR, document 3 report = self.reports["ksd"] - r, msg = report.download_pdf("81531", "07902639735") + r, msg = await report.download_pdf("81531", "07902639735") self.assertIsNone(r) self.assertIn("Document not available in case", msg) @SKIP_IF_NO_PACER_LOGIN - def test_query_can_get_multiple_results(self): + async def test_query_can_get_multiple_results(self): """ Can we run a query that gets multiple rows and parse them all? """ court_id = "paeb" report = self.reports[court_id] some_date = convert_date_string(self.valid_dates[court_id]) - report.query(some_date, some_date, sort="case_number") + await report.query(some_date, some_date, sort="case_number") self.assertEqual(3, len(report.data), "should get 3 responses for ksb") @SKIP_IF_NO_PACER_LOGIN - def test_query_using_last_good_row(self): + async def test_query_using_last_good_row(self): """ Can we run a query that triggers no content in first cell? """ court_id = "ksb" report = self.reports[court_id] some_date = convert_date_string(self.valid_dates[court_id]) - report.query(some_date, some_date, sort="case_number") + await report.query(some_date, some_date, sort="case_number") self.assertEqual(2, len(report.data), "should get 2 response for ksb") @SKIP_IF_NO_PACER_LOGIN - def test_ordering_by_date_filed(self): + async def test_ordering_by_date_filed(self): """Can we change the ordering?""" # First try both orderings in areb (where things have special cases) and # ded (Delaware) where things are more normal. @@ -172,20 +172,20 @@ def test_ordering_by_date_filed(self): for test in tests: report = self.reports[test["court"]] some_date = convert_date_string(self.valid_dates[test["court"]]) - report.query(some_date, some_date, sort="date_filed") + await report.query(some_date, some_date, sort="date_filed") self.assertEqual( test["count"], len(report.data), f"Should get {test['count']} response for {test['court']}", ) - report.query(some_date, some_date, sort="case_number") + await report.query(some_date, some_date, sort="case_number") self.assertEqual( test["count"], len(report.data), f"should get {test['count']} response for {test['court']}", ) - def test_catch_excluded_court_ids(self): + async def test_catch_excluded_court_ids(self): """Do we properly catch and prevent a query against disused courts?""" mock_session = mock.MagicMock() @@ -194,7 +194,7 @@ def test_catch_excluded_court_ids(self): some_date = convert_date_string("1/1/2015") - report.query(some_date, some_date, sort="case_number") + await report.query(some_date, some_date, sort="case_number") self.assertEqual([], report.responses, "should have empty result set") self.assertFalse( mock_session.post.called, msg="should not trigger a POST query" @@ -202,15 +202,15 @@ def test_catch_excluded_court_ids(self): @mock.patch("juriscraper.pacer.reports.logger") -class PacerMagicLinkTest(unittest.TestCase): +class PacerMagicLinkTest(unittest.IsolatedAsyncioTestCase): """Test related to PACER magic link free download""" - def setUp(self): + async def asyncSetUp(self): pacer_session = PacerSession() if pacer_credentials_are_defined(): # CAND chosen at random pacer_session = get_pacer_session() - pacer_session.login() + await pacer_session.login() self.reports = {} court_id = "nysd" @@ -220,7 +220,7 @@ def setUp(self): court_id_nda, pacer_session ) - def test_download_simple_pdf_magic_link_fails(self, mock_logger): + async def test_download_simple_pdf_magic_link_fails(self, mock_logger): """Can we download a PACER document with an invalid or expired magic link? land on a login page and returns an error. """ @@ -229,7 +229,7 @@ def test_download_simple_pdf_magic_link_fails(self, mock_logger): pacer_case_id = "568350" pacer_doc_id = "127130869087" pacer_magic_num = "46253052" - r, msg = report.download_pdf( + r, msg = await report.download_pdf( pacer_case_id, pacer_doc_id, pacer_magic_num ) mock_logger.warning.assert_called_with( @@ -240,7 +240,7 @@ def test_download_simple_pdf_magic_link_fails(self, mock_logger): # No PDF should be returned self.assertEqual(r, None) - def test_download_nda_pdf_magic_link(self, mock_logger): + async def test_download_nda_pdf_magic_link(self, mock_logger): """Can we download a NDA PACER document with an invalid or expired magic link? land on a login page and returns an error. """ @@ -250,7 +250,7 @@ def test_download_nda_pdf_magic_link(self, mock_logger): pacer_doc_id = "003014193380" pacer_magic_num = "3594681a19879633" appellate = True - r, msg = report.download_pdf( + r, msg = await report.download_pdf( pacer_case_id, pacer_doc_id, pacer_magic_num, appellate ) mock_logger.warning.assert_called_with( @@ -262,12 +262,12 @@ def test_download_nda_pdf_magic_link(self, mock_logger): self.assertEqual(r, None) -class PacerDownloadConfirmationPageTest(unittest.TestCase): +class PacerDownloadConfirmationPageTest(unittest.IsolatedAsyncioTestCase): """A variety of tests for the download confirmation page""" - def setUp(self): + async def asyncSetUp(self): self.session = get_pacer_session() - self.session.login() + await self.session.login() self.report = DownloadConfirmationPage("ca8", self.session) self.report_att = DownloadConfirmationPage("ca5", self.session) self.report_pdf = DownloadConfirmationPage("ca11", self.session) @@ -283,10 +283,10 @@ def setUp(self): self.pacer_doc_id_nef = "035022812318" @SKIP_IF_NO_PACER_LOGIN - def test_get_document_number(self): + async def test_get_document_number(self): """Can we get the PACER document number from a download confirmation page?""" - self.report.query(self.pacer_doc_id) + await self.report.query(self.pacer_doc_id) data_report = self.report.data self.assertEqual(data_report["document_number"], "00812590792") self.assertEqual(data_report["docket_number"], "14-3066") @@ -295,10 +295,10 @@ def test_get_document_number(self): self.assertEqual(data_report["document_description"], "PDF Document") @SKIP_IF_NO_PACER_LOGIN - def test_get_document_number_skipping_attachment_page(self): + async def test_get_document_number_skipping_attachment_page(self): """Can we get the PACER document number from a download confirmation page skipping the attachment page?""" - self.report_att.query(self.pacer_doc_id_att) + await self.report_att.query(self.pacer_doc_id_att) data_report = self.report_att.data self.assertEqual(data_report["document_number"], "45-1") self.assertEqual(data_report["docket_number"], "22-30311") @@ -307,26 +307,26 @@ def test_get_document_number_skipping_attachment_page(self): self.assertEqual(data_report["document_description"], "PDF Document") @SKIP_IF_NO_PACER_LOGIN - def test_no_confirmation_page(self): + async def test_no_confirmation_page(self): """If the download confirmation page is not available an empty dictionary is returned""" - self.report.query(self.no_confirmation_page_pacer_doc_id) + await self.report.query(self.no_confirmation_page_pacer_doc_id) data_report = self.report.data self.assertEqual(data_report, {}) @SKIP_IF_NO_PACER_LOGIN - def test_no_confirmation_page_pdf_returned(self): + async def test_no_confirmation_page_pdf_returned(self): """If the download confirmation page is not available when the PDF is returned directly, no valid page to parse.""" - self.report_pdf.query(self.pacer_doc_id_pdf) + await self.report_pdf.query(self.pacer_doc_id_pdf) data_report = self.report_pdf.data self.assertEqual(data_report, {}) @SKIP_IF_NO_PACER_LOGIN - def test_confirmation_page_pdf_district(self): + async def test_confirmation_page_pdf_district(self): """Can we get the PACER document number from a district download confirmation page?""" - self.report_nef.query(self.pacer_doc_id_nef) + await self.report_nef.query(self.pacer_doc_id_nef) data_report = self.report_nef.data self.assertEqual(data_report["document_number"], None) self.assertEqual(data_report["docket_number"], "3:18-cv-04865-EMC") @@ -335,10 +335,10 @@ def test_confirmation_page_pdf_district(self): self.assertEqual(data_report["document_description"], "Image670-0") @SKIP_IF_NO_PACER_LOGIN - def test_no_confirmation_page_pdf_returned_district(self): + async def test_no_confirmation_page_pdf_returned_district(self): """If the district download confirmation page is not available an empty dictionary is returned""" - self.report_nef_no_confirmation.query( + await self.report_nef_no_confirmation.query( self.pacer_doc_id_nef_no_confirmation ) data_report = self.report_nef_no_confirmation.data diff --git a/tests/network/test_PacerMobileQueryTest.py b/tests/network/test_PacerMobileQueryTest.py index 24e687e00..a2dc3d369 100644 --- a/tests/network/test_PacerMobileQueryTest.py +++ b/tests/network/test_PacerMobileQueryTest.py @@ -7,18 +7,18 @@ from tests.network import SKIP_IF_NO_PACER_LOGIN, get_pacer_session -class PacerMobileQueryTest(unittest.TestCase): +class PacerMobileQueryTest(unittest.IsolatedAsyncioTestCase): """A test of basic info for the Mobile Query""" - def setUp(self): + async def asyncSetUp(self): self.session = get_pacer_session() - self.session.login() + await self.session.login() self.report = MobileQuery("cand", self.session) self.pacer_case_id = "186730" # Foley v. Bates @SKIP_IF_NO_PACER_LOGIN - def test_query(self): - self.report.query(self.pacer_case_id) + async def test_query(self): + await self.report.query(self.pacer_case_id) # Can we get the docket entry count metadata = self.report.metadata diff --git a/tests/network/test_PacerSessionTest.py b/tests/network/test_PacerSessionTest.py index 5ae7bcc3d..2afd9db2f 100644 --- a/tests/network/test_PacerSessionTest.py +++ b/tests/network/test_PacerSessionTest.py @@ -10,7 +10,7 @@ ) -class PacerSessionTest(unittest.TestCase): +class PacerSessionTest(unittest.IsolatedAsyncioTestCase): """Test the PacerSession wrapper class""" def setUp(self): @@ -18,7 +18,7 @@ def setUp(self): @mock.patch("juriscraper.pacer.http.PacerSession.login") @SKIP_IF_NO_PACER_LOGIN - def test_auto_login(self, mock_login): + async def test_auto_login(self, mock_login): """Do we automatically log in if needed?""" court_id = "ksd" pacer_doc_id = "07902639735" @@ -27,13 +27,13 @@ def test_auto_login(self, mock_login): # This triggers and auto-login because we aren't logged in yet. self.session.username = PACER_USERNAME self.session.password = PACER_PASSWORD - _ = self.session.get( + _ = await self.session.get( url, params={ "case_id": pacer_case_id, "got_receipt": "1", }, - allow_redirects=True, + follow_redirects=True, ) self.assertTrue( mock_login.called, "PacerSession.login() should be called." diff --git a/tests/network/test_PacerShowCaseDocApiTest.py b/tests/network/test_PacerShowCaseDocApiTest.py index 7ece8623d..6e91f251d 100644 --- a/tests/network/test_PacerShowCaseDocApiTest.py +++ b/tests/network/test_PacerShowCaseDocApiTest.py @@ -11,14 +11,14 @@ ) -class PacerShowCaseDocApiTest(unittest.TestCase): +class PacerShowCaseDocApiTest(unittest.IsolatedAsyncioTestCase): @classmethod def setUpClass(cls): if pacer_credentials_are_defined(): cls.report = ShowCaseDocApi("dcd", get_pacer_session()) @SKIP_IF_NO_PACER_LOGIN - def test_queries(self): + async def test_queries(self): """Can we do basic queries?""" tests = ( # A regular document @@ -41,7 +41,7 @@ def test_queries(self): ), ) for test, expected in tests: - self.report.query(**test) + await self.report.query(**test) got = self.report.data self.assertEqual( got,