feat: 实现api版本

shing-yu · Nov 30, 2023 · 53d80c1 · 53d80c1
1 parent 00bac5f
commit 53d80c1
Show file tree

Hide file tree

Showing 4 changed files with 254 additions and 87 deletions.
diff --git a/src/api_edition/api.py b/src/api_edition/api.py
@@ -20,12 +20,13 @@
 无论您对程序进行了任何操作，请始终保留此信息。
 """
 
+import re
 import multiprocessing
 import queue
 import threading
 from multiprocessing import Process, Manager
 import time
-import fanqie_api as fa
+import qimao_api as fa
 from flask import Flask, request, jsonify, make_response
 from flask_cors import CORS
 from flask_limiter import Limiter
@@ -87,7 +88,7 @@ def crawl(url):
             with Manager() as manager:
                 return_dict = manager.dict()
                 # 创建一个新的进程来运行爬虫函数
-                p = Process(target=fa.fanqie_l, args=(url, 'utf-8', return_dict))
+                p = Process(target=fa.qimao_l, args=(url, 'utf-8', return_dict))
                 p.start()
                 p.join()  # 等待进程结束
                 if 'error' in return_dict:
@@ -123,8 +124,9 @@ def start(self):
 
     def add_url(self, url):
         # 检查URL格式是否正确，如果不正确则返回错误信息，否则将URL添加到队列中并返回成功信息
-        if "/page/" not in url:
-            return "URL格式不正确，请重新输入"
+        if "/shuku/" not in url:
+            print(f"{url} URL格式不正确，内部错误")
+            return "URL格式不正确，内部错误", 500
         else:
             if url not in self.task_status or self.task_status[url] == "失败":
                 self.url_queue.put(url)
@@ -151,18 +153,37 @@ def api():
     # 检查请求数据是否包含'action'和'id'字段，如果没有则返回418错误
     if 'action' not in data or 'id' not in data:
         return "Bad Request.The request is missing necessary json data.", 400
+    if data['id'].isdigit():
+        pass
+    else:
+        if '_0' in data['id']:
+            return "暂不支持此书籍（书籍ID中含有“_0”），请等待该书籍后续更新。"
+        if 'www.qimao.com/shuku' in data['id']:
+            # noinspection PyBroadException
+            try:
+                data['id'] = re.search(r"shuku/(\d+)", data['id']).group(1)
+            except Exception:
+                return "你输入的不是书籍ID或正确的链接。", 400
+        elif 'app-share.wtzw.com' in data['id']:
+            # noinspection PyBroadException
+            try:
+                data['id'] = re.search(r"article-detail/(\d+)", data['id']).group(1)
+            except Exception:
+                return "你输入的不是书籍ID或正确的链接。", 400
+        else:
+            return "你输入的不是书籍ID或正确的链接。", 400
 
     # 如果'action'字段的值为'add'，则尝试将URL添加到队列中，并返回相应的信息和位置
     if data['action'] == 'add':
-        url = 'https://fanqienovel.com/page/' + data['id']
+        url = 'https://www.qimao.com/shuku/' + data['id'] + '/'
         message = spider.add_url(url)
         position = list(spider.url_queue.queue).index(url) + 1 if url in list(spider.url_queue.queue) else None
         status = spider.task_status.get(url, None)
         return jsonify({'message': message, 'position': position, 'status': status})
 
     # 如果'action'字段的值为'query'，则检查URL是否在队列中，并返回相应的信息和位置或不存在的信息
     elif data['action'] == 'query':
-        url = 'https://fanqienovel.com/page/' + data['id']
+        url = 'https://www.qimao.com/shuku/' + data['id'] + '/'
         position = list(spider.url_queue.queue).index(url) + 1 if url in list(spider.url_queue.queue) else None
         status = spider.task_status.get(url, None)
         return jsonify({'exists': status is not None, 'position': position, 'status': status})
@@ -173,4 +194,4 @@ def api():
 
 if __name__ == "__main__":
     multiprocessing.freeze_support()
-    app.run(host='0.0.0.0', port=5000)
+    app.run(host='0.0.0.0', port=5001)
diff --git a/src/api_edition/get_bookinfo.py b/src/api_edition/get_bookinfo.py
@@ -0,0 +1,94 @@
+
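+# Note for developers:
+# The Qimao book page only shows the chapter list after the "作品目录"
+# (table of contents) button, whose class name is `tab-inner`, has been clicked.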
+
+import asyncio
+import os
+import public as p
+from bs4 import BeautifulSoup
+import re
+
+# 设置镜像下载地址
+os.environ["PYPPETEER_DOWNLOAD_HOST"] = "https://mirrors.huaweicloud.com"
+from pyppeteer import launch  # noqa: E402
+
+
+async def get_book_info(url):
+    """Load a book page in a headless browser and extract its metadata.
+
+    The page is opened with Pyppeteer, the synopsis is read from the initial
+    markup, every ``tab-inner`` element is clicked so that the chapter list
+    is rendered, and the resulting markup is parsed with BeautifulSoup.
+
+    Args:
+        url: Address of the book page to open.
+
+    Returns:
+        A dict with the keys ``'intro'`` (synopsis text), ``'title'`` (title
+        passed through ``p.rename``), ``'info'`` (whitespace-collapsed text
+        of the ``wrap-txt`` block followed by a newline) and ``'chapters'``
+        (the list of matching ``<li>`` tags).
+
+    Raises:
+        AttributeError: If one of the expected elements is missing from the
+            page, because ``find`` then returns ``None``.
+    """
+    # Start a Pyppeteer browser instance.
+    browser = await launch()
+
+    # Always shut the browser down, even when navigation or parsing fails;
+    # otherwise the browser started by launch() is never closed.
+    try:
+        page = await browser.newPage()
+        await page.goto(url)
+
+        # Wait until the tab buttons exist before touching the page.
+        await page.waitForSelector('.tab-inner')
+
+        # ==================== synopsis ====================
+        # Read the synopsis before switching to the chapter list.
+        html = await page.content()
+        soup = BeautifulSoup(html, "html.parser")
+        intro = soup.find('p', class_='intro').get_text().replace(' ', '\n')
+
+        # Click every tab button from page-side JavaScript so the page
+        # switches to the chapter list.
+        await page.evaluate('''() => {
+        var elements = document.getElementsByClassName('tab-inner');
+        for(var i=0; i<elements.length; i++){
+            elements[i].click();
+        }
+    }''')
+
+        # Give the page a moment to render the chapter list.
+        await asyncio.sleep(1)
+
+        # Re-read and re-parse the markup now that the tab has been clicked.
+        html = await page.content()
+        soup = BeautifulSoup(html, "html.parser")
+
+        # ==================== title ====================
+        title = soup.find('div', {'class': 'title clearfix'}).find('span', {'class': 'txt'}).text
+        # Replace illegal characters in the title.
+        title = p.rename(title)
+
+        # ==================== book information ====================
+        info_div = soup.find('div', class_='wrap-txt')
+
+        # Remove the button row, if present, so its text is not included.
+        btn = info_div.find('div', class_='btns-wrap clearfix')
+        if btn is not None:
+            btn.extract()
+
+        # Collapse every run of whitespace into a single space.
+        info_text = re.sub(r'\s+', ' ', info_div.get_text())
+        info = info_text + '\n'
+
+        # ==================== chapter list ====================
+        # Select the catalogue entries by their class-name prefix.
+        chapters = soup.select('li[class^="clearfix ref-catalog-li-"]')
+    finally:
+        await browser.close()
+
+    return {'intro': intro, 'title': title, 'info': info, 'chapters': chapters}
diff --git a/src/api_edition/public.py b/src/api_edition/public.py
@@ -13,14 +13,25 @@
 本软件提供的是按"原样"提供的，没有任何明示或暗示的保证，包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内，作者明确放弃了所有明示或暗示的担保和条件。
 
 免责声明：
-该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
+该程序仅用于学习和研究Python网络爬虫和网页处理技术，不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险，均由用户自行承担，与作者和项目协作者、贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
 
 请在使用本程序之前确保遵守相关法律法规和网站的使用政策，如有疑问，请咨询法律顾问。
 
 无论您对程序进行了任何操作，请始终保留此信息。
 """
 
 import re
+import os
+import sys
+# pycrypto模块已不再使用，使用pycryptodome模块
+# noinspection PyPackageRequirements
+from Crypto.Cipher import AES
+# noinspection PyPackageRequirements
+from Crypto.Util.Padding import unpad
+from base64 import b64decode
+import requests
+import hashlib
+import random
 
 
 # 替换非法字符
@@ -46,15 +57,86 @@ def rename(name):
     return sanitized_path
 
 
-def fix_publisher(text):
-    # 针对性去除所有 出版物 所携带的标签
-    text = re.sub(r'<p class=".*?">', '', text)
-    text = re.sub(r'<!--\?xml.*?>', '', text)
-    text = re.sub(r'<link .*?/>', '', text)
-    text = re.sub(r'<meta .*?/>', '', text)
-    text = re.sub(r'<h1 .*?>', '', text)
-    text = re.sub(r'<br/>', '', text)
-    text = re.sub(r'<!DOCTYPE html .*?>', '', text)
-    text = re.sub(r'<span .*?>', '', text)
-    text = re.sub(r'<html .*?>', '', text)
-    return text
+# AES-CBC decryption helper
+def decrypt(data, iv):
+    """Decrypt hex-encoded AES-CBC ciphertext and return it as text.
+
+    Args:
+        data: Ciphertext as a hexadecimal string.
+        iv: Initialisation vector as a hexadecimal string.
+
+    Returns:
+        The plaintext with its padding removed, decoded as UTF-8.
+    """
+    # Fixed 16-byte key, stored as hex.
+    aes_key = bytes.fromhex('32343263636238323330643730396531')
+    cipher = AES.new(aes_key, AES.MODE_CBC, iv=bytes.fromhex(iv))
+    padded = cipher.decrypt(bytes.fromhex(data))
+    return unpad(padded, AES.block_size).decode('utf-8')
+
+
+# Decrypt chapter content returned by the Qimao API
+def decrypt_qimao(content):
+    """Decode and decrypt a Base64-encoded chapter payload.
+
+    The first 16 decoded bytes are used as the IV and the rest as the
+    ciphertext handed to ``decrypt``.
+
+    Args:
+        content: Base64-encoded payload.
+
+    Returns:
+        The decrypted text, stripped of surrounding whitespace, with every
+        newline replaced by ``<br>``.
+    """
+    raw = b64decode(content)
+    iv_hex, body_hex = raw[:16].hex(), raw[16:].hex()
+    plain = decrypt(body_hex, iv_hex)
+    return plain.strip().replace('\n', '<br>')
+
+
+def get_headers(book_id):
+    """Build the signed request headers used for a given book.
+
+    The app version is picked pseudo-randomly but deterministically from
+    ``book_id``, so the same book always gets the same version.
+
+    Args:
+        book_id: Identifier of the book; used as the seed for the version
+            choice.
+
+    Returns:
+        A dict of header names to values, including a ``sign`` entry holding
+        the MD5 hex digest of the sorted ``key=value`` pairs followed by the
+        module-level ``sign_key``.
+    """
+    version_list = [
+        '73720', '73700',
+        '73620', '73600',
+        '73500',
+        '73420', '73400',
+        '73328', '73325', '73320', '73300',
+        '73220', '73200',
+        '73100', '73000', '72900',
+        '72820', '72800',
+        '70720', '62010', '62112',
+    ]
+
+    # Use a private generator seeded with the book id instead of reseeding
+    # the shared module-level generator, which would disturb every other
+    # user of `random` in the process. The selected version is unchanged.
+    version = random.Random(book_id).choice(version_list)
+
+    headers = {
+        "AUTHORIZATION": "",
+        "app-version": version,
+        "application-id": "com.****.reader",
+        "channel": "unknown",
+        "net-env": "1",
+        "platform": "android",
+        "qm-params": "",
+        "reg": "0",
+    }
+
+    # Concatenate key=value pairs in sorted key order, then append the key.
+    sign_str = ''.join(f"{k}={headers[k]}" for k in sorted(headers)) + sign_key
+
+    # The signature is the MD5 hex digest of that string.
+    headers['sign'] = hashlib.md5(sign_str.encode()).hexdigest()
+
+    return headers
+
+
+# Cache of request headers keyed by book id, so every chapter request for a
+# book reuses the same headers.
+headers_dict = {}
+
+
+def get_qimao(book_id, chapter_id, sign, timeout=10):
+    """Fetch one chapter's content from the chapter-content API.
+
+    Args:
+        book_id: Identifier of the book.
+        chapter_id: Identifier of the chapter.
+        sign: Signature value sent as the ``sign`` query parameter.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout a stalled connection would block the caller forever.
+
+    Returns:
+        The decoded JSON body of the response.
+
+    Raises:
+        requests.exceptions.RequestException: On connection failure or when
+            the timeout expires.
+    """
+    # Build the headers once per book and reuse them afterwards.
+    if book_id not in headers_dict:
+        headers_dict[book_id] = get_headers(book_id)
+    response = requests.get(
+        f"https://api-ks.wtzw.com/api/v1/chapter/content?"
+        f"id={book_id}&chapterId={chapter_id}&sign={sign}",
+        headers=headers_dict[book_id],
+        timeout=timeout,
+    )
+    return response.json()
+
+
+# Key appended to the sorted header string in get_headers() before it is
+# MD5-hashed into the `sign` header. It is looked up when get_headers() is
+# called, so defining it below that function is fine.
+sign_key = 'd3dGiJc651gSQ8w1'
+
+
+def asset_path(relative_path):
+    """Return the path of a bundled asset.
+
+    Args:
+        relative_path: Path of the asset relative to the asset root.
+
+    Returns:
+        ``relative_path`` joined onto ``sys._MEIPASS`` when that attribute
+        exists, otherwise onto the absolute path of the ``assets`` directory
+        under the current working directory.
+    """
+    # NOTE(review): `_MEIPASS` is presumably set by a frozen/bundled build
+    # (e.g. PyInstaller) — confirm.
+    # noinspection PyProtectedMember
+    base_dir = sys._MEIPASS if hasattr(sys, '_MEIPASS') else os.path.abspath("assets")
+    return os.path.join(base_dir, relative_path)