-
Notifications
You must be signed in to change notification settings - Fork 141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add SOCKS support to proxy configuration parameter #1861
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
""" | ||
Implements 2 aspects of network proxy support: | ||
|
||
1. Detecting proxy configuration in the runtime environment | ||
2. Configuring aiohttp for different proxy protocol families | ||
""" | ||
|
||
import logging | ||
import urllib.request | ||
from collections.abc import Mapping | ||
from typing import Any | ||
|
||
from aiohttp_socks import ProxyConnector | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
# The protocols we accept from 'getproxies()' in an arbitrary but reasonable-seeming precedence order.
# These roughly correspond to environment variables `(f"{p.upper()}_PROXY" for p in _supported_protocols)`.
# 'proxy_address_from_env' scans this tuple in order, so earlier (more specific/capable)
# entries win over later ones; the catch-all 'all' is checked last.
_supported_protocols = (
    "socks5",
    "socks4",
    "socks",
    "https",
    "http",
    "all",
)
|
||
|
||
def proxy_address_from_env() -> str | None:
    """
    Find an HTTP or SOCKS proxy server URL in the environment using urllib's
    'getproxies' function. This checks both environment variables and OS-specific sources
    like the Windows registry and returns a mapping of protocol name to address. If there
    are URLs for multiple protocols we use an arbitrary precedence order based roughly on
    protocol sophistication and specificity:

        'socks5' > 'socks4' > 'socks' > 'https' > 'http' > 'all'

    Note that nothing actually constrains the value of an environment variable to have a
    URI scheme/protocol that matches the protocol indicated by the variable name - e.g.
    not only is `ALL_PROXY=socks4://...` possible but so is `HTTP_PROXY=socks4://...`. We
    use the protocol from the variable name for address selection but should generate
    connection configuration based on the scheme.

    Returns the proxy address for the highest-precedence supported protocol found, or
    None if the environment contains no supported proxy configuration.
    """
    proxies_in_env = urllib.request.getproxies()
    # _supported_protocols is ordered by precedence; the first match wins.
    for proto in _supported_protocols:
        if proto in proxies_in_env:
            address = proxies_in_env[proto]
            logger.debug("found %s proxy address in environment: %s", proto, address)
            return address
    return None
|
||
|
||
def get_aiohttp_proxy_kwargs(proxy_url: str) -> Mapping[str, Any]:
    """
    Return initializer keyword arguments for `aiohttp.ClientSession` for either an HTTP
    or SOCKS proxy based on the scheme of the given URL.

    Proxy support uses aiohttp's built-in support for HTTP(S), and uses aiohttp_socks for
    SOCKS{4,5}. Initializing an aiohttp session is different for each. An HTTP proxy
    address can be passed to ClientSession's 'proxy' option:

        ClientSession(..., proxy=<PROXY_ADDRESS>, trust_env=True)

    'trust_env' enables aiohttp to read additional configuration from environment
    variables and .netrc files. `aiohttp_socks` works by replacing the default transport
    (TCPConnector) with a custom one:

        socks_transport = aiohttp_socks.ProxyConnector.from_url(<PROXY_ADDRESS>)
        ClientSession(..., connector=socks_transport)

    This uses the protocol family of the URL - matched case-insensitively - to select
    one or the other and return the corresponding keyword arguments in a dictionary.
    URLs with any other scheme (and empty or blank strings) yield an empty dictionary.
    """
    lowered = proxy_url.lower()
    # "socks" is checked before "http" and matches socks4/socks5 (and any other
    # socks* variant) URLs.
    if lowered.startswith("socks"):
        logger.debug("using SOCKS ProxyConnector for %s", proxy_url)
        return {"connector": ProxyConnector.from_url(proxy_url)}

    # "http" also matches "https" URLs.
    if lowered.startswith("http"):
        logger.debug("using HTTP proxy address %s", proxy_url)
        return {"proxy": proxy_url, "trust_env": True}

    return {}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import os | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
from aiohttp_socks import ProxyConnector | ||
|
||
from bandersnatch.config.proxy import get_aiohttp_proxy_kwargs, proxy_address_from_env | ||
from bandersnatch.master import Master | ||
|
||
|
||
@pytest.mark.parametrize(
    ("mock_env", "expected_result"),
    [
        # No environment variables => no configuration
        ({}, None),
        # Unsupported protocol => no configuration
        ({"WSS_PROXY": "wss://192.0.2.100"}, None),
        # Detect proto "http"
        ({"HTTP_PROXY": "http://192.0.2.111"}, "http://192.0.2.111"),
        # Detect proto "socks4"
        ({"SOCKS4_PROXY": "socks4://192.0.2.112:1080"}, "socks4://192.0.2.112:1080"),
        # Detect ALL_PROXY
        ({"ALL_PROXY": "socks5://192.0.2.114:1080"}, "socks5://192.0.2.114:1080"),
        # prefer https to http, if both are set
        (
            {"HTTP_PROXY": "http://192.0.2.111", "HTTPS_PROXY": "https://192.0.2.112"},
            "https://192.0.2.112",
        ),
        # prefer socks to http and socks5 to socks4
        (
            {
                "HTTPS_PROXY": "https://192.0.2.112",
                "SOCKS4_PROXY": "socks4://192.0.2.113:1080",
                "SOCKS5_PROXY": "socks5://192.0.2.114:1080",
            },
            "socks5://192.0.2.114:1080",
        ),
    ],
)
def test_proxy_address_from_env(
    mock_env: dict[str, str], expected_result: str | None
) -> None:
    # Replace the process environment wholesale so proxy settings on the test
    # runner can't leak into detection.
    with patch.dict(os.environ, mock_env, clear=True):
        assert proxy_address_from_env() == expected_result
|
||
|
||
@pytest.mark.parametrize("arg", ["", " ", "bleh", "wss://192.0.2.113"])
def test_get_aiohttp_proxy_kwargs__unsupported_arguments(
    arg: str,
) -> None:
    # Empty, blank, and non-proxy-scheme inputs all produce no session kwargs.
    result = get_aiohttp_proxy_kwargs(arg)
    assert result == {}
|
||
|
||
@pytest.mark.parametrize(
    "arg", ["http://192.0.2.111", "https://192.0.2.112", "HTTPS://192.0.2.112"]
)
def test_get_aiohttp_proxy_kwargs__http_urls(arg: str) -> None:
    # HTTP(S) schemes - matched case-insensitively - map onto aiohttp's
    # built-in 'proxy' option with environment trust enabled.
    expected = {"proxy": arg, "trust_env": True}
    assert get_aiohttp_proxy_kwargs(arg) == expected
|
||
|
||
# (1) Although 'get_aiohttp_proxy_kwargs' is synchronous, creating an 'aiohttp_socks.ProxyConnector'
# requires an event loop b/c its initializer calls 'asyncio.get_running_loop()'.
# (2) We can't verify ProxyConnector objects with __eq__ (it isn't overridden and checks reference equality)
# and it doesn't expose any public attributes containing the host address it was instantiated with AFAICT,
# so all of the following tests have some weird contortions for SOCKS cases.
# (3) But we also don't want to completely mock away ProxyConnector.from_url b/c - I discovered - that
# bypasses some significant runtime constraints on the URLs it accepts:
#     - it requires the URI have a port, even though socks4/5 have a default port
#     - it doesn't support socks{4,5}h as URI schemes
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "arg",
    [
        "socks4://198.51.100.111:1080",
        "socks5://198.51.100.112:1080",
        "SOCKS5://198.51.100.112:1080",
    ],
)
async def test_get_aiohttp_proxy_kwargs__socks_urls(arg: str) -> None:
    # Bind the patched attribute so we can assert on the MagicMock reference
    # directly; asserting via 'ProxyConnector.from_url.assert_called_with'
    # triggers a mypy 'attr-defined' error because the declared classmethod
    # has no such attribute.
    with patch.object(
        ProxyConnector, "from_url", wraps=ProxyConnector.from_url
    ) as mock_from_url:
        result = get_aiohttp_proxy_kwargs(arg)
        assert "connector" in result
        assert isinstance(result["connector"], ProxyConnector)
        mock_from_url.assert_called_with(arg)
|
||
|
||
# The following tests of Master.__init__ inspect the 'proxy_kwargs' attribute,
# which should really be a private implementation detail, but it makes
# verification much easier.


@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("arg", "expected_shape"),
    [
        (None, {}),
        ("", {}),
        ("http://192.0.2.111", {"proxy": str}),
        ("socks4://198.51.100.111:1080", {"connector": ProxyConnector}),
    ],
)
async def test_master_init__with_proxy_kwarg(
    arg: str, expected_shape: dict[str, type]
) -> None:
    # clear os.environ so that urllib.request.getproxies doesn't pick up
    # possible proxy configuration on the test runner
    with patch.dict(os.environ, {}, clear=True):
        master = Master("https://unit.test/simple/", proxy=arg)
        actual = master.proxy_kwargs
        if not expected_shape:
            assert actual == {}
            return
        for key, expected_type in expected_shape.items():
            assert key in actual
            assert isinstance(actual[key], expected_type)
|
||
|
||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("mock_env", "expected_shape"),
    [
        ({}, {}),
        ({"WSS_PROXY": "wss://foo.bar.baz"}, {}),
        ({"http_proxy": "http://192.0.2.111"}, {"proxy": str}),
        (
            {"SOCKS_PROXY": "socks4://198.51.100.111:1080"},
            {"connector": ProxyConnector},
        ),
        ({"ALL_PROXY": "socks5://198.51.100.112:1080"}, {"connector": ProxyConnector}),
        (
            {
                "http_proxy": "http://192.0.2.111",
                "all_proxy": "socks5://198.51.100.112:1080",
            },
            {"proxy": str},
        ),
        ({"socks4_proxy": "http://lolnotsocks.test:8080"}, {"proxy": str}),
    ],
)
async def test_master_init__with_proxy_env(
    mock_env: dict[str, str], expected_shape: dict[str, type]
) -> None:
    # Run under a fully-controlled environment so only 'mock_env' influences
    # proxy detection inside Master.__init__.
    with patch.dict(os.environ, mock_env, clear=True):
        master = Master("https://unit.test/simple/", proxy=None)
        actual = master.proxy_kwargs
        if not expected_shape:
            assert actual == {}
            return
        assert actual != {}
        for key, expected_type in expected_shape.items():
            assert key in actual
            assert isinstance(actual[key], expected_type)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wonder if we want to always tell people we're using a proxy?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very good point, if someone configured a proxy (...or didn't but one is used because of an environment variable), it would be important to know.
I left the messages in `bandersnatch.config.proxy` at 'debug' level, and added an 'info'-level message to the Master initializer, with the idea that the default output is a little less verbose but the proxy selection can still optionally be traced in more detail.