-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathArchive_by_2049bbsBot.py
133 lines (107 loc) · 4.87 KB
/
Archive_by_2049bbsBot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import telebot
from datetime import datetime
import time
import urllib.parse
from archivenow import archivenow
import subprocess
import random
import csv
import requests
from bs4 import BeautifulSoup
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
import warnings
# Silence every warning category process-wide (third-party parsers are noisy).
warnings.filterwarnings(action="ignore")

# Single-threaded bot instance: handlers run inline in the polling thread.
bot = telebot.TeleBot(token="TOKEN", threaded=False)
def _safe_filename(title):
    """Return *title* made safe for use as a single file name component.

    Path separators and NUL bytes are replaced with ``_`` so a page title can
    neither point into a missing sub-directory nor escape the archive
    directory. Empty or dot-only names get a random ``untitled-N`` fallback.
    """
    cleaned = title
    for bad in ('/', '\\', '\x00'):
        cleaned = cleaned.replace(bad, '_')
    cleaned = cleaned.strip()
    if cleaned in ('', '.', '..'):
        cleaned = 'untitled-' + str(random.randrange(2, 50000000))
    return cleaned


def _html_link(href, label):
    """Build an ``<a>`` tag for *href*, or return ``''`` when *href* is empty.

    *href* originates from user messages, so it is HTML-escaped before being
    placed inside the attribute.
    """
    from html import escape

    if not href:
        return ''
    return '<a href="' + escape(href, quote=True) + '" target="_blank">' + label + '</a>'


def _get_meta(soup, prop):
    """Return the ``content`` of the ``<meta property=prop>`` tag, or ``""``."""
    tag = soup.find("meta", property=prop)
    return tag.get('content', "") if tag else ""


def _page_title(soup):
    """Return the stripped ``<title>`` text, or ``""`` when the page has none."""
    return soup.title.text.strip() if soup.title else ""


def _weixin_create_date(soup):
    """Extract the ``var ct`` (create_time) value from a WeChat article page.

    Returns the date formatted as ``YYYY-MM-DD`` (UTC), or ``""`` when the
    script tag or the variable cannot be found or is not a valid timestamp.
    """
    script = soup.find("script", text=lambda text: text and "var ct" in text)
    if script is None:
        return ""
    ct = None
    for node in nodevisitor.visit(Parser().parse(script.text)):
        if isinstance(node, ast.VarDecl) and node.identifier.value == 'ct':
            # Keep the last matching declaration, as the original loop did.
            ct = getattr(node.initializer, 'value', None)
    if ct is None:
        return ""
    try:
        return datetime.utcfromtimestamp(int(ct.strip('"'))).strftime('%Y-%m-%d')
    except (ValueError, OverflowError, OSError):
        return ""


@bot.message_handler(
    regexp=r"(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])")
def echo_all(message):
    """Archive the URL sent in *message* and reply with the backup links.

    The page is fetched to extract its metadata, saved locally with the
    ``monolith`` command, pushed to archive.org and archive.today, and a row
    describing the submission is appended to the archive CSV.

    Weibo links are not supported: the user is told so and nothing is saved.
    """
    text = message.text

    # Some WeChat links carry a per-user id; keep only the first five
    # '&'-separated parts to avoid leaking it.
    if text.startswith('https://mp.weixin.qq.com/') and '__biz=' in text:
        url = '&'.join(text.split('&', 5)[:5])
    else:
        url = text

    # Fetch the page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-us",
        "Connection": "keep-alive",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
    }
    try:
        # The bot is single-threaded, so a request without a timeout could
        # block every other user indefinitely.
        html = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        telebot.logger.exception("failed to fetch %s", url)
        bot.reply_to(message, 'oooops, please send the url again.')
        return
    soup = BeautifulSoup(html.text, "html.parser")

    if url.startswith('https://mp.weixin.qq.com/'):
        # WeChat article: account name, creation time (ct) and og: metadata.
        name_tag = soup.find("a", {"id": "js_name"})
        official_account = name_tag.text.strip() if name_tag else ""
        create_date = _weixin_create_date(soup)
        title = _get_meta(soup, "og:title")
        description = _get_meta(soup, "og:description")
    elif "zhihu.com" in text:
        official_account = "小肚鸡肠的知乎"
        create_date = ""
        # A random suffix is appended to the title, which is also the file name.
        title = _page_title(soup) + "-" + str(random.randrange(2, 50000000))
        description = "知乎删贴还不让别人存档。"
    elif "weibo.com" in text:
        bot.reply_to(message, "暂不支持微博页面抓取,请截图后保存至 https://t.me/sm_ms_bot")
        # Nothing to archive; stop here instead of falling through with no title.
        return
    else:
        official_account = ""
        create_date = ""
        title = _page_title(soup)
        description = ""

    # Save the page with the external `monolith` command (must be installed).
    file_name = _safe_filename(title)
    subprocess.call(["monolith", url, '-o', '/srv/web/mono/' + file_name + '.html'])
    # Reply with the saved page's address; the name is percent-encoded so
    # non-ASCII and special characters survive in the URL.
    reply_url = 'http://206.189.252.32:8083/' + urllib.parse.quote(file_name) + '.html'
    bot.reply_to(message, reply_url)

    # Push to archive.org ('ia') and archive.today ('is'). Both results
    # default to '' so a failure still leaves them defined for the CSV row.
    reply_ia = ''
    reply_is = ''
    try:
        reply_ia = archivenow.push(url, 'ia')[0]
        bot.reply_to(message, reply_ia)
        reply_is = archivenow.push(url, 'is')[0]
        bot.reply_to(message, reply_is)
    except Exception:
        telebot.logger.exception("archive push failed for %s", url)
        bot.reply_to(message, 'oooops, please send the url again.')
    bot.reply_to(message, 'http://206.189.252.32:8085/')

    reply_ia_link = _html_link(reply_ia, '备份3')
    reply_is_link = _html_link(reply_is, '备份2')
    monolith_link = _html_link(reply_url, '备份1')
    message_link = _html_link(url, 'url')

    # Append one row per submission; newline='' is what the csv module expects.
    with open('/srv/web/archive_web3/data/archive.csv', 'a',
              encoding='utf-8', newline='') as csvfile:
        fieldnames = ['提交时间', '帐号', '标题', '发布日期', '描述', '原始链接', '2049bbs', 'archive.today', 'archive.org']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({
            '提交时间': time.ctime(),
            '帐号': official_account,
            '标题': title,
            '发布日期': create_date,
            '描述': description,
            '原始链接': message_link,
            '2049bbs': monolith_link,
            'archive.today': reply_is_link,
            'archive.org': reply_ia_link,
        })
# Debug: run a single polling call so exceptions surface immediately.
# bot.polling(none_stop=True, timeout=123)

# Production: keep polling forever; when polling raises, log the error,
# wait 15 seconds and start polling again.
while True:
    try:
        bot.polling(none_stop=True, timeout=123)
    except Exception as err:
        telebot.logger.error(err)
        time.sleep(15)