Skip to content

Commit 48b5d87

Browse files
committed
add fullarticle support
1 parent 60ddeec commit 48b5d87

File tree

3 files changed

+182
-0
lines changed

3 files changed

+182
-0
lines changed

last_fullarticle_id.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
20241129165900633511510

my_fullarticle.py

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import requests
import datetime
import os
import json
import time
from parse_fullarticle import fetch_article_text

# Run relative to the script's own directory so 'config.json' and
# 'last_fullarticle_id.txt' resolve regardless of the caller's cwd.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Load runtime configuration (expects at least 'uid' and
# 'feishu_teamchat_bot_url' keys — see usages below).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Browser-like request headers so the Eastmoney API serves the response.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Content-Type": "application/json"
}
20+
21+
def query_fullarticle_data_from_eastmoney(pageindex):
    """Fetch one page of the followed user's long-article list from Eastmoney.

    Args:
        pageindex: 1-based page number to request.

    Returns:
        The list of article dicts from the API payload; [] if the payload
        carries no list.

    Raises:
        requests.HTTPError: if the API responds with a non-200 status.
    """
    url = "https://i.eastmoney.com/api/guba/fullarticlelist"

    # Cache-busting "_" parameter: current time in milliseconds, as the
    # Eastmoney web client sends it.
    current_timestamp_ms = int(time.time() * 1000)

    params = {
        "pageindex": pageindex,
        "uid": config['uid'],
        "_": current_timestamp_ms
    }

    # timeout added so a stalled connection cannot hang the poller forever.
    response = requests.get(url, params=params, headers=headers, timeout=10)

    # BUG FIX: the original read `reply_list` after this branch, which raised
    # UnboundLocalError whenever the status was non-200 but not a 4xx/5xx
    # (raise_for_status() returns silently for those). Fail explicitly here.
    if response.status_code != 200:
        response.raise_for_status()
        raise requests.HTTPError(
            f"Unexpected status {response.status_code}", response=response)

    data = response.json()
    # Guard against a missing/null 'result' so a malformed payload yields []
    # instead of AttributeError on None.
    return (data.get('result') or {}).get('list') or []
44+
45+
def pull_fullarticle_data():
    """Run one polling cycle: collect articles newer than the last-seen ID,
    persist the new high-water mark, and forward new articles to Feishu.
    """
    # Last processed article ID; 0 means "first run" or unreadable state file.
    # Merged the two identical handlers the original had for FileNotFoundError
    # and ValueError.
    try:
        with open('last_fullarticle_id.txt', 'r') as file:
            last_fullarticle_id = int(file.read().strip())
    except (FileNotFoundError, ValueError):
        last_fullarticle_id = 0

    new_fullarticle_list = []

    # Walk up to two pages, collecting items until an already-seen article
    # appears (the feed is assumed newest-first — the original relies on this
    # too when it persists index 0 below).
    meet_old_record = False
    for pageindex in range(1, 3):
        if meet_old_record:
            break
        fullarticle_list = query_fullarticle_data_from_eastmoney(pageindex)

        for item in fullarticle_list:
            post_id = int(item.get('post_source_id'))
            if post_id > last_fullarticle_id:
                new_fullarticle_list.append(item)
            else:
                meet_old_record = True
                break

    # Persist the newest article ID so the next cycle skips everything seen.
    if new_fullarticle_list:
        latest_post_id = new_fullarticle_list[0].get('post_source_id')
        with open('last_fullarticle_id.txt', 'w') as file:
            file.write(str(latest_post_id))
    else:
        print(f"{datetime.datetime.now()} No new full article data found.")
        print("-" * 40)

    send_msg_to_feishu_bot(new_fullarticle_list)
82+
83+
def send_msg_to_feishu_bot(new_fullarticle_list):
    """Send one Feishu text message per new article, including the full
    article body scraped from the public article page.

    Args:
        new_fullarticle_list: article dicts as returned by the Eastmoney API
            (fields used: post_source_id, post_title, post_content,
            post_pic_url, post_publish_time, post_user, post_guba).

    Raises:
        requests.HTTPError: if the Feishu webhook rejects a message.
    """
    for post in new_fullarticle_list:
        post_id = post.get('post_source_id')
        post_title = post.get('post_title')
        post_content = post.get('post_content')
        post_pic_url = post.get('post_pic_url')
        post_publish_time = post.get('post_publish_time')
        # Nested objects may be absent in a malformed record; fall back to {}
        # instead of raising AttributeError on None (original chained .get()
        # calls unguarded).
        post_user_nickname = (post.get('post_user') or {}).get('user_nickname')
        post_guba = post.get('post_guba') or {}
        post_guba_name = post_guba.get('stockbar_name')
        post_guba_stockbar_code = post_guba.get('stockbar_code')

        # Distinct name for the article-page URL — the original reused `url`
        # for both this and the webhook endpoint below.
        article_url = f"https://caifuhao.eastmoney.com/news/{post_id}"
        fullarticle_text = fetch_article_text(article_url)

        msg = f"🌶🌶🌶长文更新:{post_id}\n帖子标题:{post_title}\n帖子内容摘要:{post_content}\n帖子图片:{post_pic_url}\n发布时间:{post_publish_time}\n发布用户:{post_user_nickname}\n股吧:{post_guba_name}({post_guba_stockbar_code})\n长文内容:{fullarticle_text}"

        data = {
            "msg_type": "text",
            "content": {
                "text": msg
            }
        }
        response = requests.post(config['feishu_teamchat_bot_url'], json=data)

        if response.status_code == 200:
            print(f"{datetime.datetime.now()} {msg}")
            print("-" * 40)
        else:
            response.raise_for_status()
125+
126+
# Script entry point: run a single polling cycle.
if __name__ == "__main__":
    pull_fullarticle_data()

parse_fullarticle.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import re
4+
5+
def fetch_article_text(url):
    """Download an Eastmoney article page and extract its plain-text body.

    The article body is embedded in a JavaScript ``articleTxt`` variable
    inside a <script> tag; it is located and cleaned with regexes.

    Args:
        url: full article URL, e.g. "https://caifuhao.eastmoney.com/news/<id>".

    Returns:
        The cleaned article text, or None if the page could not be fetched or
        the articleTxt variable could not be found (a diagnostic is printed).
    """
    # Guard clauses replace the original's four-level nesting; a timeout is
    # added so a stalled connection cannot hang the caller.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the <script> tag that defines the articleTxt variable.
    script_tag = soup.find('script', string=re.compile(r'var articleTxt ='))
    if script_tag is None:
        print("包含 articleTxt 变量的 <script> 标签未找到")
        return None

    # Extract the quoted HTML fragment assigned to articleTxt; the value is
    # assumed to end with '</div>"' (matches the page's current markup).
    article_txt_match = re.search(r'var articleTxt = "(.*?)</div>"',
                                  script_tag.string, re.DOTALL)
    if article_txt_match is None:
        print("articleTxt 变量未找到")
        return None

    return _clean_article_html(article_txt_match.group(1))


def _clean_article_html(article_txt):
    """Reduce the HTML fragment stored in articleTxt to readable plain text."""
    # Drop <span>/<div> wrappers entirely; paragraph tags become newlines.
    article_txt = re.sub(r'</?span.*?>', '', article_txt)
    article_txt = re.sub(r'</?div.*?>', '', article_txt)
    article_txt = re.sub(r'</?p.*?>', '\n', article_txt)
    # Strip non-breaking-space entities.
    return article_txt.replace('&nbsp;', '')
48+
49+
# Example invocation: fetch and print one known article (manual smoke test).
if __name__ == "__main__":
    url = "https://caifuhao.eastmoney.com/news/20241129165900633511510"
    article_text = fetch_article_text(url)
    if article_text:
        print(article_text)

0 commit comments

Comments
 (0)