diff --git a/README.md b/README.md index 9aa6d36..f9dc9d4 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,32 @@ There is no need to override this delay unless cloudflare-scrape generates an er scraper = cfscrape.create_scraper(delay=10) ``` +### ReCAPTCHA + +You can use the `captcha` event hook to implement reCAPTCHA solving. It's similar to Requests' [response event hook](https://requests.kennethreitz.org/en/stable/user/advanced/#event-hooks). However, this particular hook function should be set on the scraper instance. + +reCAPTCHA solving steps: +1. Extract the siteKey and any form values from the response +2. Submit the siteKey and page URL to a captcha solving service +3. Retrieve the token from said captcha solving service +4. Send the form with token to Cloudflare + +```python +scraper = cfscrape.create_scraper() +# This URL points to a site that is dedicated to Cloudflare's reCAPTCHA. +url = 'https://captcha.website' + +def solve_captcha(resp, *args, **kwargs): + # After performing all other steps, submit the form + params = { 'g-recaptcha-response': 'token', 's': 'secret' } + resp = scraper.get('{}/cdn-cgi/l/chk_captcha'.format(url), params=params) + # Cloudflare should have responded with the requested content + return resp + +scraper.hooks['captcha'] = solve_captcha +print(scraper.get(url).content) # => "..." + +``` + ## Integration It's easy to integrate cloudflare-scrape with other applications and tools. Cloudflare uses two cookies as tokens: one to verify you made it past their challenge page and one to track your session. To bypass the challenge page, simply include both of these cookies (with the appropriate user-agent) in all HTTP requests you make. 
diff --git a/cfscrape/__init__.py b/cfscrape/__init__.py index 38debd4..4e1f188 100644 --- a/cfscrape/__init__.py +++ b/cfscrape/__init__.py @@ -15,6 +15,7 @@ from requests.adapters import HTTPAdapter from requests.compat import urlparse, urlunparse from requests.exceptions import RequestException +from requests.hooks import dispatch_hook from urllib3.util.ssl_ import create_urllib3_context, DEFAULT_CIPHERS @@ -119,16 +120,28 @@ def is_cloudflare_captcha_challenge(resp): def request(self, method, url, *args, **kwargs): resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) - # Check if Cloudflare captcha challenge is presented - if self.is_cloudflare_captcha_challenge(resp): - self.handle_captcha_challenge(resp, url) - # Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled if self.is_cloudflare_iuam_challenge(resp): resp = self.solve_cf_challenge(resp, **kwargs) return resp + def send(self, request, **kwargs): + resp = super(CloudflareScraper, self).send(request, **kwargs) + + while self.is_cloudflare_captcha_challenge(resp): + hook_resp = dispatch_hook('captcha', self.hooks, resp, **kwargs) + + # The captcha hooks are expected to return a different response + if resp is hook_resp or not hook_resp: + # No captcha hooks or the response is invalid + self.handle_captcha_challenge(resp) + + # Replace the response and check again + resp = hook_resp + + return resp + def cloudflare_is_bypassed(self, url, resp=None): cookie_domain = ".{}".format(urlparse(url).netloc) return ( @@ -136,10 +149,10 @@ def cloudflare_is_bypassed(self, url, resp=None): (resp and resp.cookies.get("cf_clearance", None, domain=cookie_domain)) ) - def handle_captcha_challenge(self, resp, url): + def handle_captcha_challenge(self, resp): error = ( "Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)" - % urlparse(url).netloc + % urlparse(resp.url).netloc ) if ssl.OPENSSL_VERSION_NUMBER < 0x10101000: error += ". 
Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python." diff --git a/tests/__init__.py b/tests/__init__.py index b3cd501..663defc 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -149,7 +149,9 @@ def on_redirect(request): responses.add(DefaultResponse(url=url, body=requested_page)) responses.add(RedirectResponse( - url=submit_uri, callback=on_redirect, location=redirect_to + url=submit_uri, + callback=on_redirect, + location=redirect_to )) return test(self, **cfscrape_kwargs) @@ -158,12 +160,22 @@ def on_redirect(request): return challenge_responses_decorator -def recaptcha_responses(filename): +def recaptcha_responses(filename, redirect_to='/'): def recaptcha_responses_decorator(test): @responses.activate def wrapper(self): responses.add(CaptchaResponse(url=url, body=fixtures(filename))) + def on_redirect(request): + # We don't register the last response unless the redirect occurs + responses.add(DefaultResponse(url=url, body=requested_page)) + + responses.add(RedirectResponse( + url='{}/cdn-cgi/l/chk_captcha'.format(url), + callback=on_redirect, + location=redirect_to + )) + return test(self, **cfscrape_kwargs) return wrapper diff --git a/tests/test_cfscrape.py b/tests/test_cfscrape.py index a3bc6ea..4682863 100644 --- a/tests/test_cfscrape.py +++ b/tests/test_cfscrape.py @@ -65,6 +65,26 @@ def test_cf_recaptcha_15_04_2019(self, **kwargs): finally: ssl.OPENSSL_VERSION_NUMBER = v + @recaptcha_responses(filename='cf_recaptcha_15_04_2019.html') + def test_captcha_hooks(self, **kwargs): + scraper = cfscrape.CloudflareScraper(**kwargs) + + def captcha_hook(resp, **kwargs): + return scraper.get('{}/cdn-cgi/l/chk_captcha'.format(url)) + + scraper.hooks['captcha'] = captcha_hook + + expect(scraper.get(url).content).to.equal(requested_page) + + @recaptcha_responses(filename='cf_recaptcha_15_04_2019.html') + def test_captcha_hooks_empty_response(self, **kwargs): + scraper = cfscrape.CloudflareScraper(**kwargs) + 
scraper.hooks['captcha'] = [lambda resp, **kwargs: None] + + message = re.compile(r'captcha challenge presented') + scraper.get.when.called_with(url) \ + .should.have.raised(cfscrape.CloudflareCaptchaError, message) + @responses.activate def test_js_challenge_unable_to_identify(self): body = fixtures('js_challenge_10_04_2019.html') @@ -323,5 +343,6 @@ def test_create_scraper_with_session(self): scraper.should_not.have.property('data') session.data = {'bar': 'foo'} + scraper = cfscrape.create_scraper(sess=session) scraper.data.should.equal(session.data)