From e169a269ae92a8a4e7fd4d2773eead67339599b9 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 1 Feb 2024 10:19:25 -0800 Subject: [PATCH 1/3] Add test for archived rate limit responses --- src/wayback/tests/test_client.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/wayback/tests/test_client.py b/src/wayback/tests/test_client.py index f236a76..0a30f82 100644 --- a/src/wayback/tests/test_client.py +++ b/src/wayback/tests/test_client.py @@ -609,6 +609,16 @@ def test_get_memento_raises_no_memento_error(): '20170929002712') +@ia_vcr.use_cassette() +def test_get_memento_works_on_archived_rate_limit_responses(): + with WaybackClient() as client: + memento = client.get_memento('http://www.reddit.com/r/PokemonGiveaway', + timestamp=datetime(2015, 1, 29, 3, 49, 4), + exact=True) + assert 'http://www.reddit.com/r/PokemonGiveaway' == memento.url + assert 429 == memento.status_code + + @ia_vcr.use_cassette() def test_get_memento_follows_historical_redirects(): with WaybackClient() as client: From e8d71981024a1b98044175b22c584bae92ab16a6 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 1 Feb 2024 10:28:33 -0800 Subject: [PATCH 2/3] Return instead of raising on archived 429s Fixes #158. --- src/wayback/_client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 0593e8f..4d1a3cf 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -455,7 +455,11 @@ def send(self, request: requests.PreparedRequest, **kwargs): response = super().send(request, **kwargs) retry_delay = self.get_retry_delay(retries, response) - if retries >= maximum or not self.should_retry(response): + if is_memento_response(response): + # Mementos are necessarily successful responses, so just + # return them without any other checks. 
+ return response + elif retries >= maximum or not self.should_retry(response): if response.status_code == 429: read_and_close(response) raise RateLimitError(response, retry_delay) @@ -498,10 +502,6 @@ def request(self, method, url, **kwargs): return super().request(method, url, **kwargs) def should_retry(self, response): - # A memento may actually be a capture of an error, so don't retry it :P - if is_memento_response(response): - return False - return response.status_code in self.retryable_statuses def should_retry_error(self, error): From 770113a115a104224d87e5775c36460d615150f7 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 1 Feb 2024 10:36:42 -0800 Subject: [PATCH 3/3] Oops, forgot to include the test cassette --- ...orks_on_archived_rate_limit_responses.yaml | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml diff --git a/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml b/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml new file mode 100644 index 0000000..dab5f46 --- /dev/null +++ b/src/wayback/tests/cassettes/test_get_memento_works_on_archived_rate_limit_responses.yaml @@ -0,0 +1,113 @@ +interactions: +- request: + body: null + headers: + Accept-Encoding: + - gzip, deflate + User-Agent: + - wayback/0.4.5.dev10+gb7a16cd.d20231218 (+https://github.com/edgi-govdata-archiving/wayback) + method: GET + uri: https://web.archive.org/web/20150129034904id_/http://www.reddit.com/r/PokemonGiveaway + response: + body: + string: "\n\n\n \n Too Many Requests\n + \ \n \n \n + \

whoa there, pardner!

\n \n\n\n

we're sorry, but you appear + to be a bot and we've seen too many requests\nfrom you lately. we enforce + a hard speed limit on requests that appear to come\nfrom bots to prevent abuse.

\n\n

if + you are not a bot but are spoofing one via your browser's user agent\nstring: + please change your user agent string to avoid seeing this message\nagain.

\n\n

please + wait 6 second(s) and try again.

\n\n

as a reminder to developers, + we recommend that clients make no\n more than one\n + \ request every two seconds to avoid seeing this message.

\n \n\n" + headers: + Connection: + - keep-alive + Content-Type: + - text/html; charset=UTF-8 + Date: + - Thu, 01 Feb 2024 18:20:31 GMT + Permissions-Policy: + - interest-cohort=() + Referrer-Policy: + - no-referrer-when-downgrade + Server: + - nginx + Transfer-Encoding: + - chunked + X-NA: + - '0' + X-NID: + - '-' + X-Page-Cache: + - MISS + X-RL: + - '1' + X-location: + - All + cache-control: + - max-age=1800 + content-security-policy: + - 'default-src ''self'' ''unsafe-eval'' ''unsafe-inline'' data: blob: archive.org + web.archive.org web-static.archive.org wayback-api.archive.org analytics.archive.org + pragma.archivelab.org' + link: + - ; rel="original", ; + rel="timemap"; type="application/link-format", ; + rel="timegate", ; + rel="first memento"; datetime="Tue, 26 Jun 2012 00:00:27 GMT", ; + rel="prev memento"; datetime="Tue, 09 Dec 2014 12:01:44 GMT", ; + rel="memento"; datetime="Thu, 29 Jan 2015 03:49:04 GMT", ; + rel="next memento"; datetime="Sun, 08 Feb 2015 03:27:10 GMT", ; + rel="last memento"; datetime="Fri, 20 Oct 2023 10:43:50 GMT" + memento-datetime: + - Thu, 29 Jan 2015 03:49:04 GMT + server-timing: + - exclusion.robots;dur=1.346979, exclusion.robots.policy;dur=1.258865, cdx.remote;dur=0.566878, + esindex;dur=0.070942, LoadShardBlock;dur=668.835646, PetaboxLoader3.datanode;dur=362.949615, + PetaboxLoader3.resolve;dur=109.386489, load_resource;dur=78.884440 + x-app-server: + - wwwb-app220 + x-archive-orig-cache-control: + - no-cache + x-archive-orig-cf-cache-status: + - EXPIRED + x-archive-orig-cf-ray: + - 1b02752d98b0012c-SJC + x-archive-orig-connection: + - close + x-archive-orig-content-length: + - '-1' + x-archive-orig-date: + - Thu, 29 Jan 2015 03:49:04 GMT + x-archive-orig-edge-control: + - bypass-cache + x-archive-orig-retry-after: + - '6' + x-archive-orig-server: + - cloudflare-nginx + x-archive-orig-vary: + - accept-encoding + x-archive-orig-x-content-type-options: + - nosniff + x-archive-orig-x-frame-options: + - SAMEORIGIN + 
x-archive-orig-x-moose: + - majestic + x-archive-orig-x-ua-compatible: + - IE=edge + x-archive-orig-x-xss-protection: + - 1; mode=block + x-archive-src: + - liveweb-20150129011011/live-20150129000440-wwwb-app16.us.archive.org.warc.gz + x-tr: + - '1820' + x-ts: + - '429' + status: + code: 429 + message: Too Many Requests +version: 1