Skip to content

Commit

Permalink
fix: add exponential backoff delay when fetching linovelib pages
Browse files Browse the repository at this point in the history
  • Loading branch information
wdpm committed Mar 18, 2024
1 parent ebad837 commit b503537
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 5 deletions.
120 changes: 120 additions & 0 deletions analyze/linovelib-mobile/sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
<html class="no-js" lang="en-US"><!--<![endif]-->
<head>\n<title>Access denied | www.bilinovel.com used Cloudflare to restrict access</title>\n
<meta charset="UTF-8">
\n
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
\n
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
\n
<meta name="robots" content="noindex, nofollow">
\n
<meta name="viewport" content="width=device-width,initial-scale=1">
\n
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css">
\n\n\n
<script>\n(function () {
if (document.addEventListener && window.XMLHttpRequest && JSON && JSON.stringify) {
var e = function (a) {
var c = document.getElementById("error-feedback-survey"),
d = document.getElementById("error-feedback-success"), b = new XMLHttpRequest;
a = {event: "feedback clicked", properties: {errorCode: 1015, helpful: a, version: 1}};
b.open("POST", "https://sparrow.cloudflare.com/api/v1/event");
b.setRequestHeader("Content-Type", "application/json");
b.setRequestHeader("Sparrow-Source-Key", "c771f0e4b54944bebf4261d44bd79a1e");\nb.send(JSON.stringify(a));
c.classList.add("feedback-hidden");
d.classList.remove("feedback-hidden")
};
document.addEventListener("DOMContentLoaded", function () {
var a = document.getElementById("error-feedback"), c = document.getElementById("feedback-button-yes"),
d = document.getElementById("feedback-button-no");
"classList" in a && (a.classList.remove("feedback-hidden"), c.addEventListener("click", function () {
e(!0)
}), d.addEventListener("click", function () {
e(!1)
}))
})
}
})();\n</script>
\n\n
<script defer="" src="https://performance.radar.cloudflare.com/beacon.js"></script>
\n
</head>
\n
<body>\n
<div id="cf-wrapper">\n
<div class="cf-alert cf-alert-error cf-cookie-error hidden" id="cookie-alert" data-translate="enable_cookies">Please
enable cookies.
</div>
\n
<div id="cf-error-details" class="p-0">\n
<header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-15 antialiased">\n <h1
class="inline-block md:block mr-2 md:mb-2 font-light text-60 md:text-3xl text-black-dark leading-tight">
\n <span data-translate="error">Error</span>\n <span>1015</span>\n </h1>\n <span
class="inline-block md:block heading-ray-id font-mono text-15 lg:text-sm lg:leading-relaxed">Ray ID: 8661c41c69e78cda •</span>\n
<span class="inline-block md:block heading-ray-id font-mono text-15 lg:text-sm lg:leading-relaxed">2024-03-18 02:24:22 UTC</span>\n
<h2 class="text-gray-600 leading-1.3 text-3xl lg:text-2xl font-light">You are being rate limited</h2>\n
</header>
\n\n
<section class="w-240 lg:w-full mx-auto mb-8 lg:px-8">\n
<div id="what-happened-section" class="w-1/2 md:w-full">\n <h2
class="text-3xl leading-tight font-normal mb-4 text-black-dark antialiased"
data-translate="what_happened">What happened?</h2>\n <p>The owner of this website
(www.bilinovel.com) has banned you temporarily from accessing this website.</p>\n \n
</div>
\n\n \n
</section>
\n\n
<div class="py-8 text-center" id="error-feedback">\n
<div id="error-feedback-survey" class="footer-line-wrapper">\n Was this page helpful?\n
<button class="border border-solid bg-white cf-button cursor-pointer ml-4 px-4 py-2 rounded"
id="feedback-button-yes" type="button">Yes
</button>
\n
<button class="border border-solid bg-white cf-button cursor-pointer ml-4 px-4 py-2 rounded"
id="feedback-button-no" type="button">No
</button>
\n
</div>
\n
<div class="feedback-success feedback-hidden" id="error-feedback-success">\n Thank you for your
feedback!\n
</div>
\n
</div>
\n\n\n
<div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
\n
<p class="text-13">\n <span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong
class="font-semibold">8661c41c69e78cda</strong></span>\n <span
class="cf-footer-separator sm:hidden"></span>\n <span id="cf-footer-item-ip"
class="cf-footer-item sm:block sm:mb-1">\n Your IP:\n <button
type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>\n <span
class="hidden" id="cf-footer-ip">198.98.54.160</span>\n <span
class="cf-footer-separator sm:hidden"></span>\n </span>\n <span
class="cf-footer-item sm:block sm:mb-1"><span>Performance &amp; security by</span> <a
rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing" id="brand_link"
target="_blank">Cloudflare</a></span>\n \n
</p>
\n
<script>(function () {
function d() {
var b = a.getElementById("cf-footer-item-ip"), c = a.getElementById("cf-footer-ip-reveal");
b && "classList" in b && (b.classList.remove("hidden"), c.addEventListener("click", function () {
c.classList.add("hidden");
a.getElementById("cf-footer-ip").classList.remove("hidden")
}))
}

var a = document;
document.addEventListener && a.addEventListener("DOMContentLoaded", d)
})();</script>
\n
</div><!-- /.error-footer -->\n\n\n
</div><!-- /#cf-error-details -->\n
</div><!-- /#cf-wrapper -->\n\n
<script>\n
window._cf_translation = {};\n \n \n</script>
\n\n\n\n<span style="display: none !important;"><img width="0" height="0" hidden="" referrerpolicy="no-referrer"
src="https://fastly.cedexis-test.com/img/20367/r20-100KB.png?r=42951627"
style="display: none !important;"></span></body>
</html>
5 changes: 5 additions & 0 deletions src/linovelib2epub/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ class LinovelibException(Exception):
Base exception class for Linovelib2epub library.
"""
pass

class PageContentIllegalException(LinovelibException):
    """Signal that a page's content is not acceptable.

    The description is passed to the base exception and is also kept on
    the ``message`` attribute so handlers can read it directly.
    """

    def __init__(self, message="Page content is illegal."):
        # Initialise the base exception first, then expose the text as an attribute.
        super().__init__(message)
        self.message = message
46 changes: 41 additions & 5 deletions src/linovelib2epub/spider/linovelib_mobile_spider.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
import re
import time
from typing import Dict, List, Optional
Expand All @@ -12,7 +13,7 @@

from . import BaseNovelWebsiteSpider
from .linovelib_mobile_rules import generate_mapping_result
from ..exceptions import LinovelibException
from ..exceptions import LinovelibException, PageContentIllegalException
from ..models import LightNovel, LightNovelChapter, LightNovelVolume, LightNovelImage, CatalogLinovelibMobileChapter, \
CatalogLinovelibMobileVolume
from ..utils import (cookiedict_from_str, create_folder_if_not_exists,
Expand Down Expand Up @@ -174,8 +175,8 @@ def _sanitize_html(html: BeautifulSoup) -> str:
chapter_illustrations: List[LightNovelImage] = []
self.logger.info(f'chapter : {chapter_title}')

# 这个函数是含有状态的,必须及时覆盖 url_next 变量,否则状态机会失败
# 注意:由于特定一章的分页假设不会太多,因此这里不应用 chapter_crawl_delay 参数延迟
# 这个函数是含有状态的,必须及时覆盖 url_next 变量,否则状态机会失败
# 注意:由于这里并不关心页面内容是否正常,只收集页面链接,因此这里暂时不需要应用请求间隔延迟
url_next = self._expand_paginated_chapter_links(catalog_chapter, url_next)

# for loop [chapter_index_url]+[all paginated chapters] links of one chapter
Expand Down Expand Up @@ -259,10 +260,45 @@ def _fetch_page(self, url: str, max_retries: int = 5) -> str | None:
try:
driver.get(url)
html = driver.page_source

# Determine whether the content of the page has the following tags:
# - You are being rate limited
# - 抱歉,章节内容不支持该浏览器显示
failed_patterns = ['You are being rate limited', '抱歉,章节内容不支持该浏览器显示']
for pattern in failed_patterns:
# 使用正则表达式匹配页面内容
match = re.search(pattern, html)
if match:
raise PageContentIllegalException(f'The page content of {url} is not desired.')

return html
except PageContentIllegalException as e:
self.logger.warn(f"{e.message}")
except Exception as e:
request_count += 1
self.logger.warn(f"{url} encountered {e.__class__.__name__}, retrying ({request_count}/{max_retries})...")
self.logger.warn(f"{url} encountered {e.__class__.__name__}.")

request_count += 1
# 指数退避参考 https://cloud.google.com/memorystore/docs/redis/exponential-backoff?hl=zh-cn#example_algorithm
# 具体逻辑:
# 1.向服务器特定API发出请求。
# 2.如果请求失败,请等待 1 + random_number_milliseconds 秒后再重试请求。
# 3.如果请求失败,请等待 2 + random_number_milliseconds 秒后再重试请求。
# 4.如果请求失败,请等待 4 + random_number_milliseconds 秒后再重试请求。
# 5.依此类推,等待时间上限为 maximum_backoff。
# 等待时间达到上限后,您可以继续等待并重试,直到达到重试次数上限(但接下来的重试操作不会增加各次重试之间的等待时间)。

# 等待时间为 min(((2^n)+random_number_seconds), maximum_backoff),其中,n 会在每次迭代(请求)后增加 1。
# 其中:
# - random_number_seconds 是小于1的秒数(随机值)。
# - maximum_backoff 设置为一个较大的容忍值,这里设置为10s。这是基于经验的估计。
n = request_count
random_number_seconds = round(random.uniform(0, 1), 2) # 0.01-0.99s
maximum_backoff = 10
retry_interval = min(round(((2 ** (n - 1)) + random_number_seconds), 2), maximum_backoff)

self.logger.warning(
f'Retrying {url}({request_count}/{max_retries})...; retry_interval: {retry_interval}(s)')
time.sleep(retry_interval)

return None

Expand Down

0 comments on commit b503537

Please sign in to comment.