Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't raise on *archived* rate limit errors #159

Merged
merged 3 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/wayback/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,11 @@ def send(self, request: requests.PreparedRequest, **kwargs):
response = super().send(request, **kwargs)
retry_delay = self.get_retry_delay(retries, response)

if retries >= maximum or not self.should_retry(response):
if is_memento_response(response):
# Mementos are necessarily successful responses, so just
# return them without any other checks.
return response
elif retries >= maximum or not self.should_retry(response):
if response.status_code == 429:
read_and_close(response)
raise RateLimitError(response, retry_delay)
Expand Down Expand Up @@ -498,10 +502,6 @@ def request(self, method, url, **kwargs):
return super().request(method, url, **kwargs)

def should_retry(self, response):
# A memento may actually be a capture of an error, so don't retry it :P
if is_memento_response(response):
return False

return response.status_code in self.retryable_statuses

def should_retry_error(self, error):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
interactions:
- request:
body: null
headers:
Accept-Encoding:
- gzip, deflate
User-Agent:
- wayback/0.4.5.dev10+gb7a16cd.d20231218 (+https://github.com/edgi-govdata-archiving/wayback)
method: GET
uri: https://web.archive.org/web/20150129034904id_/http://www.reddit.com/r/PokemonGiveaway
response:
body:
string: "\n<!doctype html>\n<html>\n <head>\n <title>Too Many Requests</title>\n
\ <style>\n body {\n font: small verdana, arial, helvetica,
sans-serif;\n width: 600px;\n margin: 0 auto;\n }\n\n
\ h1 {\n height: 40px;\n background: transparent url(//www.redditstatic.com/reddit.com.header.png)
no-repeat scroll top right;\n }\n </style>\n </head>\n <body>\n
\ <h1>whoa there, pardner!</h1>\n \n\n\n<p>we're sorry, but you appear
to be a bot and we've seen too many requests\nfrom you lately. we enforce
a hard speed limit on requests that appear to come\nfrom bots to prevent abuse.</p>\n\n<p>if
you are not a bot but are spoofing one via your browser's user agent\nstring:
please change your user agent string to avoid seeing this message\nagain.</p>\n\n<p>please
wait 6 second(s) and try again.</p>\n\n <p>as a reminder to developers,
we recommend that clients make no\n more than <a href=\"http://github.com/reddit/reddit/wiki/API\">one\n
\ request every two seconds</a> to avoid seeing this message.</p>\n </body>\n</html>\n"
headers:
Connection:
- keep-alive
Content-Type:
- text/html; charset=UTF-8
Date:
- Thu, 01 Feb 2024 18:20:31 GMT
Permissions-Policy:
- interest-cohort=()
Referrer-Policy:
- no-referrer-when-downgrade
Server:
- nginx
Transfer-Encoding:
- chunked
X-NA:
- '0'
X-NID:
- '-'
X-Page-Cache:
- MISS
X-RL:
- '1'
X-location:
- All
cache-control:
- max-age=1800
content-security-policy:
- 'default-src ''self'' ''unsafe-eval'' ''unsafe-inline'' data: blob: archive.org
web.archive.org web-static.archive.org wayback-api.archive.org analytics.archive.org
pragma.archivelab.org'
link:
- <http://www.reddit.com/r/PokemonGiveaway>; rel="original", <https://web.archive.org/web/timemap/link/http://www.reddit.com/r/PokemonGiveaway>;
rel="timemap"; type="application/link-format", <https://web.archive.org/web/http://www.reddit.com/r/PokemonGiveaway>;
rel="timegate", <https://web.archive.org/web/20120626000027/http://www.reddit.com:80/r/Pokemongiveaway>;
rel="first memento"; datetime="Tue, 26 Jun 2012 00:00:27 GMT", <https://web.archive.org/web/20141209120144/http://www.reddit.com:80/r/Pokemongiveaway/>;
rel="prev memento"; datetime="Tue, 09 Dec 2014 12:01:44 GMT", <https://web.archive.org/web/20150129034904/http://www.reddit.com/r/PokemonGiveaway>;
rel="memento"; datetime="Thu, 29 Jan 2015 03:49:04 GMT", <https://web.archive.org/web/20150208032710/http://www.reddit.com:80/r/Pokemongiveaway>;
rel="next memento"; datetime="Sun, 08 Feb 2015 03:27:10 GMT", <https://web.archive.org/web/20231020104350/https://www.reddit.com/r/Pokemongiveaway/>;
rel="last memento"; datetime="Fri, 20 Oct 2023 10:43:50 GMT"
memento-datetime:
- Thu, 29 Jan 2015 03:49:04 GMT
server-timing:
- exclusion.robots;dur=1.346979, exclusion.robots.policy;dur=1.258865, cdx.remote;dur=0.566878,
esindex;dur=0.070942, LoadShardBlock;dur=668.835646, PetaboxLoader3.datanode;dur=362.949615,
PetaboxLoader3.resolve;dur=109.386489, load_resource;dur=78.884440
x-app-server:
- wwwb-app220
x-archive-orig-cache-control:
- no-cache
x-archive-orig-cf-cache-status:
- EXPIRED
x-archive-orig-cf-ray:
- 1b02752d98b0012c-SJC
x-archive-orig-connection:
- close
x-archive-orig-content-length:
- '-1'
x-archive-orig-date:
- Thu, 29 Jan 2015 03:49:04 GMT
x-archive-orig-edge-control:
- bypass-cache
x-archive-orig-retry-after:
- '6'
x-archive-orig-server:
- cloudflare-nginx
x-archive-orig-vary:
- accept-encoding
x-archive-orig-x-content-type-options:
- nosniff
x-archive-orig-x-frame-options:
- SAMEORIGIN
x-archive-orig-x-moose:
- majestic
x-archive-orig-x-ua-compatible:
- IE=edge
x-archive-orig-x-xss-protection:
- 1; mode=block
x-archive-src:
- liveweb-20150129011011/live-20150129000440-wwwb-app16.us.archive.org.warc.gz
x-tr:
- '1820'
x-ts:
- '429'
status:
code: 429
message: Too Many Requests
version: 1
10 changes: 10 additions & 0 deletions src/wayback/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,16 @@ def test_get_memento_raises_no_memento_error():
'20170929002712')


@ia_vcr.use_cassette()
def test_get_memento_works_on_archived_rate_limit_responses():
with WaybackClient() as client:
memento = client.get_memento('http://www.reddit.com/r/PokemonGiveaway',
timestamp=datetime(2015, 1, 29, 3, 49, 4),
exact=True)
assert 'http://www.reddit.com/r/PokemonGiveaway' == memento.url
assert 429 == memento.status_code


@ia_vcr.use_cassette()
def test_get_memento_follows_historical_redirects():
with WaybackClient() as client:
Expand Down