Skip to content

Commit 48b5d87

Browse files
committed
add fullarticle support
1 parent 60ddeec commit 48b5d87

File tree

3 files changed

+182
-0
lines changed

3 files changed

+182
-0
lines changed

last_fullarticle_id.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
20241129165900633511510

my_fullarticle.py

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import requests
import datetime
import os
import json
import time
from parse_fullarticle import fetch_article_text

# Run relative to the script's own directory so 'config.json' and
# 'last_fullarticle_id.txt' resolve regardless of the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Load runtime configuration (expects at least 'uid' and
# 'feishu_teamchat_bot_url' keys — see usages below).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Browser-like request headers so the Eastmoney API serves the response.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Content-Type": "application/json"
}
20+
21+
def query_fullarticle_data_from_eastmoney(pageindex):
    """Fetch one page of the followed user's long-article list from Eastmoney.

    Args:
        pageindex: 1-based page number to request.

    Returns:
        The list of article dicts from the API payload; [] if the payload
        carries no list.

    Raises:
        requests.HTTPError: if the API responds with a non-200 status.
    """
    url = "https://i.eastmoney.com/api/guba/fullarticlelist"

    # Cache-busting "_" parameter: current time in milliseconds, as the
    # Eastmoney web client sends it.
    current_timestamp_ms = int(time.time() * 1000)

    params = {
        "pageindex": pageindex,
        "uid": config['uid'],
        "_": current_timestamp_ms
    }

    # timeout added so a stalled connection cannot hang the poller forever.
    response = requests.get(url, params=params, headers=headers, timeout=10)

    # BUG FIX: the original read `reply_list` after this branch, which raised
    # UnboundLocalError whenever the status was non-200 but not a 4xx/5xx
    # (raise_for_status() returns silently for those). Fail explicitly here.
    if response.status_code != 200:
        response.raise_for_status()
        raise requests.HTTPError(
            f"Unexpected status {response.status_code}", response=response)

    data = response.json()
    # Guard against a missing/null 'result' so a malformed payload yields []
    # instead of AttributeError on None.
    return (data.get('result') or {}).get('list') or []
44+
45+
def pull_fullarticle_data():
    """Run one polling cycle: collect articles newer than the last-seen ID,
    persist the new high-water mark, and forward new articles to Feishu.
    """
    # Last processed article ID; 0 means "first run" or unreadable state file.
    # Merged the two identical handlers the original had for FileNotFoundError
    # and ValueError.
    try:
        with open('last_fullarticle_id.txt', 'r') as file:
            last_fullarticle_id = int(file.read().strip())
    except (FileNotFoundError, ValueError):
        last_fullarticle_id = 0

    new_fullarticle_list = []

    # Walk up to two pages, collecting items until an already-seen article
    # appears (the feed is assumed newest-first — the original relies on this
    # too when it persists index 0 below).
    meet_old_record = False
    for pageindex in range(1, 3):
        if meet_old_record:
            break
        fullarticle_list = query_fullarticle_data_from_eastmoney(pageindex)

        for item in fullarticle_list:
            post_id = int(item.get('post_source_id'))
            if post_id > last_fullarticle_id:
                new_fullarticle_list.append(item)
            else:
                meet_old_record = True
                break

    # Persist the newest article ID so the next cycle skips everything seen.
    if new_fullarticle_list:
        latest_post_id = new_fullarticle_list[0].get('post_source_id')
        with open('last_fullarticle_id.txt', 'w') as file:
            file.write(str(latest_post_id))
    else:
        print(f"{datetime.datetime.now()} No new full article data found.")
        print("-" * 40)

    send_msg_to_feishu_bot(new_fullarticle_list)
82+
83+
def send_msg_to_feishu_bot(new_fullarticle_list):
    """Send one Feishu text message per new article, including the full
    article body scraped from the public article page.

    Args:
        new_fullarticle_list: article dicts as returned by the Eastmoney API
            (fields used: post_source_id, post_title, post_content,
            post_pic_url, post_publish_time, post_user, post_guba).

    Raises:
        requests.HTTPError: if the Feishu webhook rejects a message.
    """
    for post in new_fullarticle_list:
        post_id = post.get('post_source_id')
        post_title = post.get('post_title')
        post_content = post.get('post_content')
        post_pic_url = post.get('post_pic_url')
        post_publish_time = post.get('post_publish_time')
        # Nested objects may be absent in a malformed record; fall back to {}
        # instead of raising AttributeError on None (original chained .get()
        # calls unguarded).
        post_user_nickname = (post.get('post_user') or {}).get('user_nickname')
        post_guba = post.get('post_guba') or {}
        post_guba_name = post_guba.get('stockbar_name')
        post_guba_stockbar_code = post_guba.get('stockbar_code')

        # Distinct name for the article-page URL — the original reused `url`
        # for both this and the webhook endpoint below.
        article_url = f"https://caifuhao.eastmoney.com/news/{post_id}"
        fullarticle_text = fetch_article_text(article_url)

        msg = f"🌶🌶🌶长文更新:{post_id}\n帖子标题:{post_title}\n帖子内容摘要:{post_content}\n帖子图片:{post_pic_url}\n发布时间:{post_publish_time}\n发布用户:{post_user_nickname}\n股吧:{post_guba_name}({post_guba_stockbar_code})\n长文内容:{fullarticle_text}"

        data = {
            "msg_type": "text",
            "content": {
                "text": msg
            }
        }
        response = requests.post(config['feishu_teamchat_bot_url'], json=data)

        if response.status_code == 200:
            print(f"{datetime.datetime.now()} {msg}")
            print("-" * 40)
        else:
            response.raise_for_status()
125+
126+
# Script entry point: run a single polling cycle.
if __name__ == "__main__":
    pull_fullarticle_data()

parse_fullarticle.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import re
4+
5+
def fetch_article_text(url):
    """Download an Eastmoney article page and extract its plain-text body.

    The article body is embedded in a JavaScript ``articleTxt`` variable
    inside a <script> tag; it is located and cleaned with regexes.

    Args:
        url: full article URL, e.g. "https://caifuhao.eastmoney.com/news/<id>".

    Returns:
        The cleaned article text, or None if the page could not be fetched or
        the articleTxt variable could not be found (a diagnostic is printed).
    """
    # Guard clauses replace the original's four-level nesting; a timeout is
    # added so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the <script> tag that defines the articleTxt variable.
    script_tag = soup.find('script', string=re.compile(r'var articleTxt ='))
    if script_tag is None:
        print("包含 articleTxt 变量的 <script> 标签未找到")
        return None

    # Extract the quoted HTML fragment assigned to articleTxt; the value is
    # assumed to end with '</div>"' (matches the page's current markup).
    article_txt_match = re.search(r'var articleTxt = "(.*?)</div>"',
                                  script_tag.string, re.DOTALL)
    if article_txt_match is None:
        print("articleTxt 变量未找到")
        return None

    return _clean_article_html(article_txt_match.group(1))


def _clean_article_html(article_txt):
    """Reduce the HTML fragment stored in articleTxt to readable plain text."""
    # Drop <span>/<div> wrappers entirely; paragraph tags become newlines.
    article_txt = re.sub(r'</?span.*?>', '', article_txt)
    article_txt = re.sub(r'</?div.*?>', '', article_txt)
    article_txt = re.sub(r'</?p.*?>', '\n', article_txt)
    # Strip non-breaking-space entities.
    return article_txt.replace('&nbsp;', '')
48+
49+
# Example invocation: fetch and print one known article (manual smoke test).
if __name__ == "__main__":
    url = "https://caifuhao.eastmoney.com/news/20241129165900633511510"
    article_text = fetch_article_text(url)
    if article_text:
        print(article_text)

0 commit comments

Comments
 (0)