Skip to content

Commit

Permalink
feat: 实现api版本
Browse files Browse the repository at this point in the history
  • Loading branch information
shing-yu committed Nov 30, 2023
1 parent 00bac5f commit 53d80c1
Show file tree
Hide file tree
Showing 4 changed files with 254 additions and 87 deletions.
35 changes: 28 additions & 7 deletions src/api_edition/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@
无论您对程序进行了任何操作,请始终保留此信息。
"""

import re
import multiprocessing
import queue
import threading
from multiprocessing import Process, Manager
import time
import fanqie_api as fa
import qimao_api as fa
from flask import Flask, request, jsonify, make_response
from flask_cors import CORS
from flask_limiter import Limiter
Expand Down Expand Up @@ -87,7 +88,7 @@ def crawl(url):
with Manager() as manager:
return_dict = manager.dict()
# 创建一个新的进程来运行爬虫函数
p = Process(target=fa.fanqie_l, args=(url, 'utf-8', return_dict))
p = Process(target=fa.qimao_l, args=(url, 'utf-8', return_dict))
p.start()
p.join() # 等待进程结束
if 'error' in return_dict:
Expand Down Expand Up @@ -123,8 +124,9 @@ def start(self):

def add_url(self, url):
# 检查URL格式是否正确,如果不正确则返回错误信息,否则将URL添加到队列中并返回成功信息
if "/page/" not in url:
return "URL格式不正确,请重新输入"
if "/shuku/" not in url:
print(f"{url} URL格式不正确,内部错误")
return "URL格式不正确,内部错误", 500
else:
if url not in self.task_status or self.task_status[url] == "失败":
self.url_queue.put(url)
Expand All @@ -151,18 +153,37 @@ def api():
# 检查请求数据是否包含'action'和'id'字段,如果没有则返回418错误
if 'action' not in data or 'id' not in data:
return "Bad Request.The request is missing necessary json data.", 400
if data['id'].isdigit():
pass
else:
if '_0' in data['id']:
return "暂不支持此书籍(书籍ID中含有“_0”),请等待该书籍后续更新。"
if 'www.qimao.com/shuku' in data['id']:
# noinspection PyBroadException
try:
data['id'] = re.search(r"shuku/(\d+)", data['id']).group(1)
except Exception:
return "你输入的不是书籍ID或正确的链接。", 400
elif 'app-share.wtzw.com' in data['id']:
# noinspection PyBroadException
try:
data['id'] = re.search(r"article-detail/(\d+)", data['id']).group(1)
except Exception:
return "你输入的不是书籍ID或正确的链接。", 400
else:
return "你输入的不是书籍ID或正确的链接。", 400

# 如果'action'字段的值为'add',则尝试将URL添加到队列中,并返回相应的信息和位置
if data['action'] == 'add':
url = 'https://fanqienovel.com/page/' + data['id']
url = 'https://www.qimao.com/shuku/' + data['id'] + '/'
message = spider.add_url(url)
position = list(spider.url_queue.queue).index(url) + 1 if url in list(spider.url_queue.queue) else None
status = spider.task_status.get(url, None)
return jsonify({'message': message, 'position': position, 'status': status})

# 如果'action'字段的值为'query',则检查URL是否在队列中,并返回相应的信息和位置或不存在的信息
elif data['action'] == 'query':
url = 'https://fanqienovel.com/page/' + data['id']
url = 'https://www.qimao.com/shuku/' + data['id'] + '/'
position = list(spider.url_queue.queue).index(url) + 1 if url in list(spider.url_queue.queue) else None
status = spider.task_status.get(url, None)
return jsonify({'exists': status is not None, 'position': position, 'status': status})
Expand All @@ -173,4 +194,4 @@ def api():

if __name__ == "__main__":
multiprocessing.freeze_support()
app.run(host='0.0.0.0', port=5000)
app.run(host='0.0.0.0', port=5001)
94 changes: 94 additions & 0 deletions src/api_edition/get_bookinfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@

# Developer note:
# The Qimao web page only renders the chapter catalogue after the
# "作品目录" (catalogue) button — class name: tab-inner — is clicked.

import asyncio
import os
import public as p
from bs4 import BeautifulSoup
import re

# 设置镜像下载地址
os.environ["PYPPETEER_DOWNLOAD_HOST"] = "https://mirrors.huaweicloud.com"
from pyppeteer import launch # noqa: E402


async def get_book_info(url):
    """Scrape title, intro, metadata and chapter list from a Qimao book page.

    A headless browser is required because the chapter catalogue is only
    rendered after the catalogue tab (class ``tab-inner``) is clicked.

    :param url: book page URL (e.g. ``https://www.qimao.com/shuku/<id>/``)
    :return: dict with keys ``'intro'``, ``'title'``, ``'info'`` and
             ``'chapters'`` (``'chapters'`` is a list of bs4 Tag objects)
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)

        # Wait until the catalogue tab button exists, i.e. the page rendered.
        await page.waitForSelector('.tab-inner')

        # ==================== intro ====================
        # The intro is present before switching tabs, so grab it first.
        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")
        intro = soup.find('p', class_='intro').get_text().replace(' ', '\n')

        # Simulate clicking every catalogue tab button to switch the page
        # content over to the chapter list.
        await page.evaluate('''() => {
            var elements = document.getElementsByClassName('tab-inner');
            for(var i=0; i<elements.length; i++){
                elements[i].click();
            }
        }''')

        # Give the tab content a moment to render.
        # NOTE(review): a fixed sleep is fragile — prefer waitForSelector on
        # the catalogue list if a reliable selector exists.
        await asyncio.sleep(1)

        # Re-fetch and re-parse the page now that the catalogue is visible.
        html = await page.content()
        soup = BeautifulSoup(html, "html.parser")

        # ==================== title ====================
        title = soup.find('div', {'class': 'title clearfix'}).find('span', {'class': 'txt'}).text
        # Strip characters that are illegal in file names.
        title = p.rename(title)

        # ==================== metadata ====================
        info_div = soup.find('div', class_='wrap-txt')

        # Drop the action-button area so only the descriptive text remains.
        btn = info_div.find('div', class_='btns-wrap clearfix')
        if btn is not None:
            btn.extract()

        # Collapse runs of whitespace into single spaces.
        info_text = re.sub(r'\s+', ' ', info_div.get_text())
        info = info_text + '\n'

        # ==================== chapter list ====================
        chapters = soup.select('li[class^="clearfix ref-catalog-li-"]')

        return {'intro': intro, 'title': title, 'info': info, 'chapters': chapters}
    finally:
        # Always shut the browser down, even if scraping failed part-way,
        # so we do not leak headless-Chromium processes.
        await browser.close()
108 changes: 95 additions & 13 deletions src/api_edition/public.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,25 @@
本软件提供的是按"原样"提供的,没有任何明示或暗示的保证,包括但不限于适销性和特定用途的适用性。作者不对任何直接或间接损害或其他责任承担任何责任。在适用法律允许的最大范围内,作者明确放弃了所有明示或暗示的担保和条件。
免责声明:
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
该程序仅用于学习和研究Python网络爬虫和网页处理技术,不得用于任何非法活动或侵犯他人权益的行为。使用本程序所产生的一切法律责任和风险,均由用户自行承担,与作者和项目协作者、贡献者无关。作者不对因使用该程序而导致的任何损失或损害承担任何责任。
请在使用本程序之前确保遵守相关法律法规和网站的使用政策,如有疑问,请咨询法律顾问。
无论您对程序进行了任何操作,请始终保留此信息。
"""

import re
import os
import sys
# pycrypto模块已不再使用,使用pycryptodome模块
# noinspection PyPackageRequirements
from Crypto.Cipher import AES
# noinspection PyPackageRequirements
from Crypto.Util.Padding import unpad
from base64 import b64decode
import requests
import hashlib
import random


# 替换非法字符
Expand All @@ -46,15 +57,86 @@ def rename(name):
return sanitized_path


def fix_publisher(text):
    """Strip the markup tags that published-book (出版物) content carries.

    Each pattern is removed in turn; matched tags are replaced with the
    empty string and everything else is left untouched.
    """
    tag_patterns = (
        r'<p class=".*?">',
        r'<!--\?xml.*?>',
        r'<link .*?/>',
        r'<meta .*?/>',
        r'<h1 .*?>',
        r'<br/>',
        r'<!DOCTYPE html .*?>',
        r'<span .*?>',
        r'<html .*?>',
    )
    for pattern in tag_patterns:
        text = re.sub(pattern, '', text)
    return text
def decrypt(data, iv):
    """AES-128-CBC decrypt a hex-encoded payload.

    :param data: ciphertext as a hex string
    :param iv: initialization vector as a hex string
    :return: decrypted UTF-8 plaintext with PKCS#7 padding removed
    """
    # Fixed key shipped with the client (hex for the ASCII digits below).
    aes_key = bytes.fromhex('32343263636238323330643730396531')
    cipher = AES.new(aes_key, AES.MODE_CBC, iv=bytes.fromhex(iv))
    plaintext = unpad(cipher.decrypt(bytes.fromhex(data)), AES.block_size)
    return plaintext.decode('utf-8')


def decrypt_qimao(content):
    """Decode and decrypt a base64 chapter payload from the Qimao API.

    The first 16 bytes of the decoded blob are the IV; the remainder is
    the ciphertext.  Newlines in the plaintext are converted to ``<br>``.
    """
    raw = b64decode(content)
    plaintext = decrypt(raw[16:].hex(), raw[:16].hex())
    return plaintext.strip().replace('\n', '<br>')


# Key appended to the sorted header string before hashing; defined up here
# so it is in scope (and visible to readers) before get_headers() uses it.
sign_key = 'd3dGiJc651gSQ8w1'


def get_headers(book_id):
    """Build the signed request headers the Qimao Android app would send.

    The app version is chosen pseudo-randomly but seeded with the book id,
    so the same book always yields the same headers (and the same sign).

    :param book_id: book identifier, used as the RNG seed
    :return: dict of HTTP headers including an md5 ``'sign'`` field
    """
    version_list = [
        '73720', '73700',
        '73620', '73600',
        '73500',
        '73420', '73400',
        '73328', '73325', '73320', '73300',
        '73220', '73200',
        '73100', '73000', '72900',
        '72820', '72800',
        '70720', '62010', '62112',
    ]

    # Seed with the book id so the version choice is stable per book.
    random.seed(book_id)
    version = random.choice(version_list)

    headers = {
        "AUTHORIZATION": "",
        "app-version": f"{version}",
        "application-id": "com.****.reader",
        "channel": "unknown",
        "net-env": "1",
        "platform": "android",
        "qm-params": "",
        "reg": "0",
    }

    # Sign = md5 over "key=value" pairs concatenated in sorted key order,
    # followed by the shared sign_key.
    keys = sorted(headers.keys())
    sign_str = ''.join([k + '=' + str(headers[k]) for k in keys]) + sign_key
    headers['sign'] = hashlib.md5(sign_str.encode()).hexdigest()

    return headers


# Per-book header cache: headers are deterministic per book_id, so compute
# them once and reuse across chapter requests.
headers_dict = {}


def get_qimao(book_id, chapter_id, sign):
    """Fetch one chapter's (encrypted) content from the Qimao API.

    :param book_id: book identifier
    :param chapter_id: chapter identifier
    :param sign: per-chapter signature expected by the API
    :return: decoded JSON response as a dict
    """
    if book_id not in headers_dict:
        headers_dict[book_id] = get_headers(book_id)
    headers = headers_dict[book_id]
    response = requests.get(f"https://api-ks.wtzw.com/api/v1/chapter/content?"
                            f"id={book_id}&chapterId={chapter_id}&sign={sign}",
                            headers=headers,
                            timeout=10)  # avoid hanging forever on a dead connection
    return response.json()


def asset_path(relative_path):
    """Resolve *relative_path* against the bundled assets directory.

    Inside a PyInstaller bundle, resources live under the temporary
    extraction directory exposed as ``sys._MEIPASS``; otherwise fall back
    to the local ``assets`` folder.
    """
    # noinspection PyProtectedMember
    base = getattr(sys, '_MEIPASS', None)
    if base is None:
        base = os.path.abspath("assets")
    return os.path.join(base, relative_path)
Loading

0 comments on commit 53d80c1

Please sign in to comment.