Migrate http client from requests to httpx async client
ttys0dev committed Oct 7, 2023
1 parent e5b458d commit ce26428
Showing 45 changed files with 452 additions and 365 deletions.
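
A quick sense of the pattern the rest of the diff repeats: every blocking requests call becomes an awaited call on a shared httpx.AsyncClient, driven from a coroutine. A minimal before/after sketch (illustrative only; the URL and names are placeholders, not code from this commit):

# Before: blocking request on a requests.Session
import requests

session = requests.session()
response = session.get("https://example.com/opinions", timeout=60)
print(response.status_code)
session.close()

# After: awaited request on an httpx.AsyncClient inside a coroutine
import asyncio
import httpx

async def fetch():
    # http2=True assumes the optional h2 dependency (httpx[http2]) is installed
    async with httpx.AsyncClient(http2=True) as client:
        response = await client.get("https://example.com/opinions", timeout=60)
        print(response.status_code)

asyncio.run(fetch())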
26 changes: 14 additions & 12 deletions juriscraper/AbstractSite.py
@@ -3,7 +3,7 @@
from datetime import date, datetime

import certifi
import requests
import httpx

from juriscraper.lib.date_utils import fix_future_year_typo, json_date_handler
from juriscraper.lib.exceptions import InsanityException
@@ -33,7 +33,7 @@ class AbstractSite:
Should not contain lists that can't be sorted by the _date_sort function.
"""

def __init__(self, cnt=None):
def __init__(self, cnt=None, user_agent="Juriscraper", **kwargs):
super().__init__()

# Computed metadata
@@ -44,10 +44,12 @@ def __init__(self, cnt=None):
self.downloader_executed = False
self.cookies = {}
self.cnt = cnt or CaseNameTweaker()
self.user_agent = user_agent
kwargs.setdefault("http2", True)
self.request = {
"verify": certifi.where(),
"session": requests.session(),
"headers": {"User-Agent": "Juriscraper"},
"session": httpx.AsyncClient(**kwargs),
"headers": {"User-Agent": self.user_agent},
# Disable CDN caching on sites like SCOTUS (ahem)
"cache-control": "no-cache, no-store, max-age=1",
"parameters": {},
@@ -65,8 +67,8 @@ def __init__(self, cnt=None):
self._req_attrs = []
self._all_attrs = []

def __del__(self):
self.close_session()
async def __aexit__(self):
await self.close_session()

def __str__(self):
out = []
@@ -84,9 +86,9 @@ def __getitem__(self, i):
def __len__(self):
return len(self.case_names)

def close_session(self):
async def close_session(self):
if self.request["session"]:
self.request["session"].close()
await self.request["session"].aclose()

def _make_item(self, i):
"""Using i, convert a single item into a dict. This is effectively a
@@ -344,23 +346,23 @@ def _process_request_parameters(self, parameters={}):
del parameters["verify"]
self.request["parameters"] = parameters

def _request_url_get(self, url):
async def _request_url_get(self, url):
"""Execute GET request and assign appropriate request dictionary
values
"""
self.request["url"] = url
self.request["response"] = self.request["session"].get(
self.request["response"] = await self.request["session"].get(
url,
headers=self.request["headers"],
verify=self.request["verify"],
timeout=60,
**self.request["parameters"],
)

def _request_url_post(self, url):
async def _request_url_post(self, url):
"""Execute POST request and assign appropriate request dictionary values"""
self.request["url"] = url
self.request["response"] = self.request["session"].post(
self.request["response"] = await self.request["session"].post(
url,
headers=self.request["headers"],
verify=self.request["verify"],
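One consequence visible in close_session() above: teardown of the client is itself awaitable. A sketch of the client lifecycle implied by the new request dict, with client-level configuration (certificate bundle, default headers, HTTP/2) and an awaited aclose(); the URL is a placeholder and this is not code from the commit:

import asyncio
import certifi
import httpx

async def main():
    # Configure once at the client, mirroring the request dict above
    client = httpx.AsyncClient(
        http2=True,
        verify=certifi.where(),
        headers={"User-Agent": "Juriscraper"},
    )
    try:
        response = await client.get("https://example.com", timeout=60)
        print(response.http_version, response.status_code)
    finally:
        # requests.Session.close() becomes an awaited aclose()
        await client.aclose()

asyncio.run(main())
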
11 changes: 10 additions & 1 deletion juriscraper/DeferringList.py
@@ -1,3 +1,6 @@
import asyncio
import inspect

from juriscraper.AbstractSite import logger


@@ -42,7 +45,13 @@ def __getitem__(self, item):
logger.info(
f"Getting deferred value from seed: {self._data[item]}"
)
new_val = self._fetching_function(self._data[item])
if inspect.isawaitable(self._fetching_function):
loop = asyncio.get_event_loop()
new_val = loop.run_until_complete(
self._fetching_function(self._data[item])
)
else:
new_val = self._fetching_function(self._data[item])
self._data[item] = new_val
self._fetched_items[item] = True
return new_val
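The DeferringList change makes the lazy lookup tolerate both plain callables and async fetchers. A standalone sketch of that dispatch idea, using inspect.iscoroutinefunction and asyncio.run rather than the isawaitable/get_event_loop combination in the diff; the fetcher names are hypothetical:

import asyncio
import inspect

def fetch_sync(seed):
    return f"value-for-{seed}"

async def fetch_async(seed):
    return f"value-for-{seed}"

def resolve(fetcher, seed):
    # Drive coroutine functions on a fresh event loop (only safe from
    # synchronous call sites); call plain functions directly.
    if inspect.iscoroutinefunction(fetcher):
        return asyncio.run(fetcher(seed))
    return fetcher(seed)

print(resolve(fetch_sync, "abc123"))
print(resolve(fetch_async, "abc123"))
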
4 changes: 3 additions & 1 deletion juriscraper/OpinionSiteLinearWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
4 changes: 3 additions & 1 deletion juriscraper/OpinionSiteWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
4 changes: 3 additions & 1 deletion juriscraper/OralArgumentSiteLinearWebDriven.py
@@ -7,6 +7,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
WebDriven.__init__(self, args, kwargs)

async def __aexit__(self):
await self.close_session()

def __del__(self):
self.close_session()
self.close_webdriver_session()
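
Because close_session() is now a coroutine, the synchronous __del__ path cannot await it; the __aexit__ hooks added to these WebDriven classes provide an async teardown point instead. A rough caller-side sketch of explicit cleanup (the site object and scrape step are placeholders):

import asyncio

async def run(site):
    # `site` is assumed to be an instance of one of the WebDriven site
    # classes above; the scraping work itself is elided.
    try:
        ...  # perform the scrape
    finally:
        await site.close_session()      # async httpx cleanup
        site.close_webdriver_session()  # selenium cleanup stays synchronous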
41 changes: 22 additions & 19 deletions juriscraper/fdsys/FDSysSite.py
@@ -5,19 +5,20 @@
from datetime import date
from pprint import pprint

import requests
import httpx
from httpx import InvalidURL
from lxml import etree
from requests.exceptions import MissingSchema

from juriscraper.AbstractSite import AbstractSite


def get_tree(url):
async def get_tree(url, **kwargs):
try:
response = requests.get(url, stream=True)
response.raw.decode_content = True
return etree.parse(response.raw)
except MissingSchema:
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(url)
return etree.parse(await response.aread())
except InvalidURL:
return etree.parse(url)


@@ -160,23 +161,25 @@ def __getitem__(self, i):
def __len__(self):
return len(xpath(self.html, "//s:loc/text()"))

def save_mods_file(self, url):
async def save_mods_file(self, url, **kwargs):
mods_url = FDSysModsContent._get_mods_file_url(url)
name = "-".join(mods_url.split("/")[-2].split("-")[1:])
with open(f"./examples/2006/{name}.xml", "w") as handle:
response = requests.get(mods_url, stream=True)
for block in response.iter_content(1024):
handle.write(block)

def _download(self, request_dict={}):
with open(f"./examples/2006/{name}.xml", "wb") as handle:
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
async with client.stream("GET", mods_url) as response:
async for block in response.aiter_bytes():
handle.write(block)

async def _download(self, request_dict={}):
"""
it actually builds an XML tree
"""
return get_tree(self.url)
return await get_tree(self.url)

def _download_backwards(self, year):
async def _download_backwards(self, year):
self.url = self.base_url.format(year=year)
self.html = self._download()
self.html = await self._download()
if self.html is not None:
# Setting status is important because it prevents the download
# function from being run a second time by the parse method.
@@ -185,10 +188,10 @@
def _check_sanity(self):
pass

def parse(self):
async def parse(self):
if self.status is None:
# Run the downloader if it hasn't been run already
self.html = self._download()
self.html = await self._download()
return self


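save_mods_file() above replaces requests' iter_content() with httpx's streaming API. A self-contained sketch of that streaming pattern, with a placeholder URL and output path rather than the commit's exact values:

import asyncio
import httpx

async def download(url: str, path: str) -> None:
    async with httpx.AsyncClient(http2=True) as client:
        # client.stream() reads the body incrementally, so large MODS
        # files never need to fit in memory all at once
        async with client.stream("GET", url) as response:
            with open(path, "wb") as handle:
                async for chunk in response.aiter_bytes():
                    handle.write(chunk)

asyncio.run(download("https://example.com/some-mods-file.xml", "./some-mods-file.xml"))
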
28 changes: 17 additions & 11 deletions juriscraper/fdsys/scrape_court_names.py
@@ -1,12 +1,17 @@
import asyncio
import json
from pprint import pprint

import requests
import httpx
from lxml import etree, html


def get_court_names():
response = requests.get("https://www.courtlistener.com/api/jurisdictions/")
async def get_court_names(**kwargs):
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(
"https://www.courtlistener.com/api/jurisdictions/"
)
tree = html.fromstring(response.text)

data = dict()
@@ -21,13 +26,14 @@ def get_court_names():
json.dump(data, f)


def get_fdsys_court_names():
response = requests.get(
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml",
stream=True,
)
response.raw.decode_content = True
tree = etree.parse(response.raw)
async def get_fdsys_court_names(**kwargs):
kwargs.setdefault("http2", True)
async with httpx.AsyncClient(**kwargs) as client:
response = await client.get(
"https://www.gpo.gov/smap/fdsys/sitemap_2014/2014_USCOURTS_sitemap.xml"
)
tree = etree.parse(await response.aread())

data = dict()

for url in tree.xpath(
@@ -47,4 +53,4 @@ def get_fdsys_court_names():

if __name__ == "__main__":
# get_court_names()
get_fdsys_court_names()
asyncio.run(get_fdsys_court_names())
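
With both helpers now defined as coroutines, a single event loop can drive them, and they can even run concurrently; a sketch of such an entry point (the concurrent gather is an illustration, not something this commit adds, and http2=True assumes the httpx[http2] extra is installed):

import asyncio

from juriscraper.fdsys.scrape_court_names import (
    get_court_names,
    get_fdsys_court_names,
)

async def main():
    # Fetch both court-name sources concurrently on one event loop
    await asyncio.gather(get_court_names(), get_fdsys_court_names())

asyncio.run(main())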
49 changes: 24 additions & 25 deletions juriscraper/lasc/http.py
@@ -1,22 +1,20 @@
import requests
import httpx
from lxml.html import fromstring

from ..lib.log_tools import make_default_logger

requests.packages.urllib3.disable_warnings(
requests.packages.urllib3.exceptions.InsecureRequestWarning
)

logger = make_default_logger()


class LASCSession(requests.Session):
class LASCSession(httpx.AsyncClient):
"""
A requests.Session object with special tooling to handle the Los Angeles
A httpx.AsyncClient object with special tooling to handle the Los Angeles
Superior Court Media Access portal.
"""

def __init__(self, username=None, password=None):
def __init__(
self, username=None, password=None, user_agent="Juriscraper", **kwargs
):
"""
Instantiate a new LASC HTTP Session with some Juriscraper defaults.
This method requires credentials from the media access portal.
@@ -25,7 +23,7 @@ def __init__(self, username=None, password=None):
:param password: MAP password
:return: A LASCSession object
"""
super().__init__()
super().__init__(**kwargs)

self.html = None

@@ -53,34 +51,35 @@ def __init__(self, username=None, password=None):
"password": password,
"request_type": "RESPONSE",
}
self.user_agent = user_agent
self.headers = {
"Origin": ms_base_url,
"User-Agent": "Juriscraper",
"User-Agent": self.user_agent,
}

def get(self, url, auto_login=False, **kwargs):
"""Overrides request.Session.get with session retry logic.
async def get(self, url, auto_login=False, **kwargs):
"""Overrides httpx.AsyncClient.get with session retry logic.
:param url: url string to GET
:param auto_login: Whether the auto-login procedure should happen.
:return: requests.Response
:return: httpx.Response
"""
kwargs.setdefault("timeout", 30)
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})

return super().get(url, **kwargs)
return await super().get(url, **kwargs)

def post(self, url, auto_login=False, **kwargs):
"""Overrides request.Session.post with session retry logic.
async def post(self, url, auto_login=False, **kwargs):
"""Overrides httpx.AsyncClient.post with session retry logic.
:param url: url string to GET
:param auto_login: Whether the auto-login procedure should happen.
:return: requests.Response
:return: httpx.Response
"""
kwargs.setdefault("timeout", 30)
kwargs.setdefault("params", {"p": "B2C_1_Media-LASC-SUSI"})

return super().post(url, **kwargs)
return await super().post(url, **kwargs)

@staticmethod
def _parse_new_html_for_keys(r):
@@ -89,7 +88,7 @@ def _parse_new_html_for_keys(r):
This method parses the HTML after the first login page and identifies
the parameter values required for the next step.
:param r: A request.Response object
:param r: A httpx.Response object
:return: A dict containing the needed keys
"""
html = fromstring(r.text)
@@ -103,7 +102,7 @@ def _parse_new_html_for_keys(r):
def _check_login(r):
"""Check that the login succeeded
:param r: A request.Response object
:param r: A httpx.Response object
:return: None
:raises LASCLoginException
"""
@@ -121,7 +120,7 @@ def _check_login(r):
def _update_header_token(self, r):
self.headers["X-CSRF-TOKEN"] = r.text.split("csrf")[1].split('"')[2]

def login(self):
async def login(self):
"""Log into the LASC Media Access Portal
The process is tricky, requiring two GET requests, each of which
returns HTML or JSON that is parsed for values to send in a subsequent
@@ -326,20 +325,20 @@ def login(self):
"""

logger.info("Logging into MAP has begun")
r = self.get(self.login_url)
r = await self.get(self.login_url)
self._update_header_token(r)

# Call part one of Microsoft login API
r = self.post(self.api_url1, data=self.login_data)
r = await self.post(self.api_url1, data=self.login_data)
self._check_login(r)

# Call part two of Microsoft login API - Redirect
r = self.get(self.api_url2)
r = await self.get(self.api_url2)

# Finalize login with post into LA MAP site
parsed_keys = self._parse_new_html_for_keys(r)

self.post(self.signin_url, data=parsed_keys)
await self.post(self.signin_url, data=parsed_keys)

logger.info("Successfully Logged into MAP")

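Since LASCSession now subclasses httpx.AsyncClient, it can be used as an async context manager and its login(), get(), and post() must be awaited. A rough usage sketch; the credentials are placeholders and error handling is elided:

import asyncio

from juriscraper.lasc.http import LASCSession

async def main():
    # `async with` on the AsyncClient subclass handles aclose() on exit
    async with LASCSession(username="user@example.com", password="secret") as session:
        await session.login()
        # ... await session.get()/session.post() against the MAP endpoints ...

asyncio.run(main())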

