Migrate http client from requests to httpx async client #739

Open · wants to merge 1 commit into base: main
26 changes: 14 additions & 12 deletions juriscraper/AbstractSite.py
@@ -3,7 +3,7 @@
from datetime import date, datetime

import certifi
import requests
import httpx

from juriscraper.lib.date_utils import fix_future_year_typo, json_date_handler
from juriscraper.lib.exceptions import InsanityException
@@ -33,7 +33,7 @@ class AbstractSite:
Should not contain lists that can't be sorted by the _date_sort function.
"""

def __init__(self, cnt=None):
def __init__(self, cnt=None, user_agent="Juriscraper", **kwargs):
super().__init__()

# Computed metadata
@@ -44,10 +44,12 @@ def __init__(self, cnt=None):
self.downloader_executed = False
self.cookies = {}
self.cnt = cnt or CaseNameTweaker()
self.user_agent = user_agent
kwargs.setdefault("http2", True)
self.request = {
"verify": certifi.where(),
"session": requests.session(),
"headers": {"User-Agent": "Juriscraper"},
"session": httpx.AsyncClient(**kwargs),
"headers": {"User-Agent": self.user_agent},
# Disable CDN caching on sites like SCOTUS (ahem)
"cache-control": "no-cache, no-store, max-age=1",
"parameters": {},
@@ -65,8 +67,8 @@ def __init__(self, cnt=None):
self._req_attrs = []
self._all_attrs = []

def __del__(self):
self.close_session()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close_session()

def __str__(self):
out = []
@@ -84,9 +86,9 @@ def __getitem__(self, i):
def __len__(self):
return len(self.case_names)

def close_session(self):
async def close_session(self):
if self.request["session"]:
self.request["session"].close()
await self.request["session"].aclose()

def _make_item(self, i):
"""Using i, convert a single item into a dict. This is effectively a
@@ -344,23 +346,23 @@ def _process_request_parameters(self, parameters={}):
del parameters["verify"]
self.request["parameters"] = parameters

def _request_url_get(self, url):
async def _request_url_get(self, url):
"""Execute GET request and assign appropriate request dictionary
values
"""
self.request["url"] = url
self.request["response"] = self.request["session"].get(
self.request["response"] = await self.request["session"].get(
url,
headers=self.request["headers"],
verify=self.request["verify"],
timeout=60,
**self.request["parameters"],
)

def _request_url_post(self, url):
async def _request_url_post(self, url):
"""Execute POST request and assign appropriate request dictionary values"""
self.request["url"] = url
self.request["response"] = self.request["session"].post(
self.request["response"] = await self.request["session"].post(
url,
headers=self.request["headers"],
verify=self.request["verify"],
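Note for reviewers: `async with` requires both `__aenter__` and `__aexit__`, so an `__aenter__` returning `self` is still needed before sites can be used as async context managers; until then, `close_session()` has to be awaited explicitly. A minimal sketch of driving the migrated downloader, assuming an existing scraper such as `ca1` inherits these changes unchanged:

```python
import asyncio

from juriscraper.opinions.united_states.federal_appellate import ca1


async def main():
    site = ca1.Site()
    try:
        # _request_url_get is now a coroutine backed by httpx.AsyncClient
        await site._request_url_get(site.url)
        print(site.request["response"].status_code)
    finally:
        # explicit cleanup; aclose() happens inside close_session()
        await site.close_session()


asyncio.run(main())
```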
11 changes: 10 additions & 1 deletion juriscraper/DeferringList.py
@@ -1,3 +1,6 @@
import asyncio
import inspect

from juriscraper.AbstractSite import logger


@@ -42,7 +45,13 @@ def __getitem__(self, item):
logger.info(
f"Getting deferred value from seed: {self._data[item]}"
)
new_val = self._fetching_function(self._data[item])
# iscoroutinefunction detects an async fetcher; the function object itself is not awaitable
if inspect.iscoroutinefunction(self._fetching_function):
loop = asyncio.get_event_loop()
new_val = loop.run_until_complete(
self._fetching_function(self._data[item])
)
else:
new_val = self._fetching_function(self._data[item])
self._data[item] = new_val
self._fetched_items[item] = True
return new_val
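The branch above lets a synchronous caller consume an async fetcher: `iscoroutinefunction` spots it, and `run_until_complete` drives it on the current loop. Note that `run_until_complete` raises `RuntimeError` if a loop is already running, so this path only works from synchronous code. A minimal sketch, assuming the usual `seed`/`fetcher` keyword arguments:

```python
import asyncio

from juriscraper.DeferringList import DeferringList


async def fetch(seed):
    # hypothetical async fetcher; a real one would await an httpx request
    await asyncio.sleep(0)
    return seed.upper()


deferred = DeferringList(seed=["a", "b"], fetcher=fetch)
print(deferred[0])  # fetched lazily on first access -> "A"
```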
4 changes: 3 additions & 1 deletion juriscraper/OpinionSiteLinearWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self, exc_type, exc_value, traceback):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
4 changes: 3 additions & 1 deletion juriscraper/OpinionSiteWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self, exc_type, exc_value, traceback):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
4 changes: 3 additions & 1 deletion juriscraper/OralArgumentSiteLinearWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self, exc_type, exc_value, traceback):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
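In all three classes, `__del__` still calls the now-async `close_session()` without awaiting it, which will only emit a "coroutine was never awaited" warning instead of closing the client; the async context-manager path avoids that, but needs an `__aenter__` as well. A minimal sketch of what that could look like, using one of the classes above:

```python
from juriscraper.OpinionSiteLinearWebDriven import OpinionSiteLinearWebDriven


class ManagedSite(OpinionSiteLinearWebDriven):
    # async with needs __aenter__ too; the diff only adds __aexit__
    async def __aenter__(self):
        return self


async def scrape():
    async with ManagedSite() as site:
        ...  # drive the scraper here
    # on exit, __aexit__ awaited close_session()
```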
41 changes: 22 additions & 19 deletions juriscraper/fdsys/FDSysSite.py
@@ -5,19 +5,20 @@
from datetime import date
from io import BytesIO
from pprint import pprint

import requests
import httpx
from httpx import InvalidURL
from lxml import etree
from requests.exceptions import MissingSchema

from juriscraper.AbstractSite import AbstractSite


def get_tree(url):
async def get_tree(url, **kwargs):
try:
response = requests.get(url, stream=True)
response.raw.decode_content = True
return etree.parse(response.raw)
except MissingSchema:
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(url)
# aread() returns bytes; wrap them so lxml receives a file-like object
return etree.parse(BytesIO(await response.aread()))
except InvalidURL:
return etree.parse(url)


@@ -160,23 +161,25 @@ def __getitem__(self, i):
def __len__(self):
return len(xpath(self.html, "//s:loc/text()"))

def save_mods_file(self, url):
async def save_mods_file(self, url, **kwargs):
mods_url = FDSysModsContent._get_mods_file_url(url)
name = "-".join(mods_url.split("/")[-2].split("-")[1:])
with open(f"./examples/2006/{name}.xml", "w") as handle:
response = requests.get(mods_url, stream=True)
for block in response.iter_content(1024):
handle.write(block)

def _download(self, request_dict={}):
with open(f"./examples/2006/{name}.xml", "wb") as handle:
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
async with client.stream("GET", mods_url) as response:
async for block in response.aiter_bytes():
handle.write(block)

async def _download(self, request_dict={}):
"""
it actually builds an XML tree
"""
return get_tree(self.url)
return await get_tree(self.url)

def _download_backwards(self, year):
async def _download_backwards(self, year):
self.url = self.base_url.format(year=year)
self.html = self._download()
self.html = await self._download()
if self.html is not None:
# Setting status is important because it prevents the download
# function from being run a second time by the parse method.
@@ -185,10 +188,10 @@ def _download_backwards(self, year):
def _check_sanity(self):
pass

def parse(self):
async def parse(self):
if self.status is None:
# Run the downloader if it hasn't been run already
self.html = self._download()
self.html = await self._download()
return self


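One thing worth verifying here: with requests, a scheme-less path fell through via `MissingSchema` to local-file parsing; recent httpx versions raise `UnsupportedProtocol` (a subclass of `TransportError`) rather than `InvalidURL` for such URLs, so the fallback may need to catch both. A minimal sketch of running the now-async crawler end to end, assuming `FDSysSite` still constructs without arguments:

```python
import asyncio

from juriscraper.fdsys.FDSysSite import FDSysSite


async def main():
    site = FDSysSite()
    await site.parse()  # parse() is now a coroutine
    for record in site:  # __getitem__/__len__ stay synchronous
        print(record)


asyncio.run(main())
```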
28 changes: 17 additions & 11 deletions juriscraper/fdsys/scrape_court_names.py
@@ -1,12 +1,17 @@
import asyncio
import json
from io import BytesIO
from pprint import pprint

import requests
import httpx
from lxml import etree, html


def get_court_names():
response = requests.get("https://www.courtlistener.com/api/jurisdictions/")
async def get_court_names(**kwargs):
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(
"https://www.courtlistener.com/api/jurisdictions/"
)
tree = html.fromstring(response.text)

data = dict()
@@ -21,13 +26,14 @@ def get_court_names():
json.dump(data, f)


def get_fdsys_court_names():
response = requests.get(
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml",
stream=True,
)
response.raw.decode_content = True
tree = etree.parse(response.raw)
async def get_fdsys_court_names(**kwargs):
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml"
)
# aread() returns bytes; wrap them so lxml receives a file-like object
tree = etree.parse(BytesIO(await response.aread()))

data = dict()

for url in tree.xpath(
@@ -47,4 +53,4 @@

if __name__ == "__main__":
# get_court_names()
get_fdsys_court_names()
asyncio.run(get_fdsys_court_names())
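Since both helpers are coroutines now, they can also share one event loop instead of being run separately; a minimal sketch, assuming both functions are imported from this module:

```python
import asyncio


async def scrape_all():
    # run both sitemap scrapers concurrently on one loop
    await asyncio.gather(get_court_names(), get_fdsys_court_names())


asyncio.run(scrape_all())
```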
49 changes: 24 additions & 25 deletions juriscraper/lasc/http.py
@@ -1,22 +1,20 @@
import requests
import httpx
from lxml.html import fromstring

from ..lib.log_tools import make_default_logger

requests.packages.urllib3.disable_warnings(
requests.packages.urllib3.exceptions.InsecureRequestWarning
)

logger = make_default_logger()


class LASCSession(requests.Session):
class LASCSession(httpx.AsyncClient):
"""
A requests.Session object with special tooling to handle the Los Angeles
An httpx.AsyncClient object with special tooling to handle the Los Angeles
Superior Court Media Access portal.
"""

def __init__(self, username=None, password=None):
def __init__(
self, username=None, password=None, user_agent="Juriscraper", **kwargs
):
"""
Instantiate a new LASC HTTP Session with some Juriscraper defaults.
This method requires credentials from the media access portal.
@@ -25,7 +23,7 @@ def __init__(self, username=None, password=None):
:param password: MAP password
:return: A LASCSession object
"""
super().__init__()
super().__init__(**kwargs)

self.html = None

@@ -53,34 +51,35 @@ def __init__(self, username=None, password=None):
"password": password,
"request_type": "RESPONSE",
}
self.user_agent = user_agent
self.headers = {
"Origin": ms_base_url,
"User-Agent": "Juriscraper",
"User-Agent": self.user_agent,
}

def get(self, url, auto_login=False, **kwargs):
"""Overrides request.Session.get with session retry logic.
async def get(self, url, auto_login=False, **kwargs):
"""Overrides httpx.AsyncClient.get with session retry logic.

:param url: url string to GET
:param auto_login: Whether the auto-login procedure should happen.
:return: requests.Response
:return: httpx.Response
"""
kwargs.setdefault("timeout", 30)
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})

return super().get(url, **kwargs)
return await super().get(url, **kwargs)

def post(self, url, auto_login=False, **kwargs):
"""Overrides request.Session.post with session retry logic.
async def post(self, url, auto_login=False, **kwargs):
"""Overrides httpx.AsyncClient.post with session retry logic.

:param url: url string to GET
:param auto_login: Whether the auto-login procedure should happen.
:return: requests.Response
:return: httpx.Response
"""
kwargs.setdefault("timeout", 30)
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})

return super().post(url, **kwargs)
return await super().post(url, **kwargs)

@staticmethod
def _parse_new_html_for_keys(r):
@@ -89,7 +88,7 @@ def _parse_new_html_for_keys(r):
This method parses the HTML after the first login page and identifies
the parameter values required for the next step.

:param r: A request.Response object
:param r: A httpx.Response object
:return: A dict containing the needed keys
"""
html = fromstring(r.text)
@@ -103,7 +102,7 @@ def _check_login(r):
def _check_login(r):
"""Check that the login succeeded

:param r: A request.Response object
:param r: A httpx.Response object
:return: None
:raises LASCLoginException
"""
@@ -121,7 +120,7 @@ def _check_login(r):
def _update_header_token(self, r):
self.headers["X-CSRF-TOKEN"] = r.text.split("csrf")[1].split('"')[2]

def login(self):
async def login(self):
"""Log into the LASC Media Access Portal
The process is tricky, requiring two GET requests, each of which
returns HTML or JSON that is parsed for values to send in a subsequent
@@ -326,20 +325,20 @@ def login(self):
"""

logger.info("Logging into MAP has begun")
r = self.get(self.login_url)
r = await self.get(self.login_url)
self._update_header_token(r)

# Call part one of Microsoft login API
r = self.post(self.api_url1, data=self.login_data)
r = await self.post(self.api_url1, data=self.login_data)
self._check_login(r)

# Call part two of Microsoft login API - Redirect
r = self.get(self.api_url2)
r = await self.get(self.api_url2)

# Finalize login with post into LA MAP site
parsed_keys = self._parse_new_html_for_keys(r)

self.post(self.signin_url, data=parsed_keys)
await self.post(self.signin_url, data=parsed_keys)

logger.info("Successfully Logged into MAP")

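A minimal sketch of the migrated login flow from a caller's perspective; the credentials are placeholders, and `aclose()` is the `httpx.AsyncClient` counterpart to `Session.close()`:

```python
import asyncio

from juriscraper.lasc.http import LASCSession


async def main():
    session = LASCSession(username="user@example.com", password="secret")
    try:
        await session.login()
        # ...use session.get()/session.post() against the MAP here...
    finally:
        await session.aclose()


asyncio.run(main())
```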