Commit

Fix bug in PDF URL detection
yym68686 committed Sep 17, 2023
1 parent 53effa7 commit 13f4fb8
Showing 2 changed files with 44 additions and 32 deletions.
7 changes: 4 additions & 3 deletions agent.py
@@ -305,8 +305,8 @@ def search_summary(result, model=config.DEFAULT_SEARCH_MODEL, temperature=config
engans_ddg = en_ddg_search_thread.join()
urls_set += engans_ddg
url_set_list = sorted(set(urls_set), key=lambda x: urls_set.index(x))
url_pdf_set_list = [item for item in url_set_list if "pdf" in item]
url_set_list = [item for item in url_set_list if "pdf" not in item]
url_pdf_set_list = [item for item in url_set_list if item.endswith(".pdf")]
url_set_list = [item for item in url_set_list if not item.endswith(".pdf")]

pdf_result = ""
pdf_threads = []
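
The fix above replaces a substring test with an extension check: "pdf" in item also matches URLs that merely contain "pdf" somewhere in the path, so ordinary web pages were routed to the PDF handler by mistake, while item.endswith(".pdf") only matches links that actually end in the extension. A minimal sketch of the difference, using hypothetical URLs:

    urls = [
        "https://example.com/papers/report.pdf",       # a real PDF link
        "https://example.com/pdf-tools/convert.html",  # contains "pdf" but is an HTML page
    ]

    # Old check: the substring match flags both URLs as PDFs.
    old_pdf = [u for u in urls if "pdf" in u]

    # New check: only the URL whose path actually ends in ".pdf" is kept.
    new_pdf = [u for u in urls if u.endswith(".pdf")]

    print(old_pdf)  # -> both URLs
    print(new_pdf)  # -> only the first URL

Note that endswith(".pdf") can still miss PDFs served with a query string (e.g. a URL ending in ".pdf?download=1"), so the new check narrows false positives rather than covering every case.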
@@ -403,7 +403,8 @@ def search_summary(result, model=config.DEFAULT_SEARCH_MODEL, temperature=config
# for i in search_summary("Has the United States won the china US trade war?"):
# for i in search_summary("What does 'n+2' mean in Huawei's 'Mate 60 Pro' chipset? Please conduct in-depth analysis."):
# for i in search_summary("AUTOMATIC1111 是什么?"):
for i in search_summary("中国利用外资指标下降了 87% ?真的假的。"):
for i in search_summary("python telegram bot 怎么接收pdf文件"):
# for i in search_summary("中国利用外资指标下降了 87% ?真的假的。"):
# for i in search_summary("How much does the 'zeabur' software service cost per month? Is it free to use? Any limitations?"):
# for i in search_summary("英国脱欧没有好处,为什么英国人还是要脱欧?"):
# for i in search_summary("2022年俄乌战争为什么发生?"):
69 changes: 40 additions & 29 deletions test/test_Web_crawler.py
@@ -1,3 +1,4 @@
+import re
import os
os.system('cls' if os.name == 'nt' else 'clear')
import time
@@ -24,33 +25,43 @@ def Web_crawler(url: str) -> str:
print('\033[0m')
return result

-# def Web_crawler(url: str) -> str:
-#     """Return the body text of the given url; must be a valid URL."""
-#     headers = {
-#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
-#     }
-#     result = ''
-#     try:
-#         requests.packages.urllib3.disable_warnings()
-#         response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
-#         content_length = int(response.headers.get('Content-Length', 0))
-#         if content_length > 500000:
-#             print("Skipping large file:", url)
-#             return result
-#         content = response.content
-#         detected_encoding = chardet.detect(response.content)['encoding']
-#         decoded_content = response.content.decode(detected_encoding, errors='replace')
-#         # soup = BeautifulSoup(response.text, 'html.parser')
-#         soup = BeautifulSoup(decoded_content, 'lxml')
-#         # soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
-#         body = "".join(soup.find('body').get_text().split('\n'))
-#         result = body
-#     except Exception as e:
-#         print('\033[31m')
-#         print("error url", url)
-#         print("error", e)
-#         print('\033[0m')
-#     return result
+def Web_crawler(url: str) -> str:
+    """Return the body text of the given url; must be a valid URL."""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+    }
+    result = ''
+    try:
+        requests.packages.urllib3.disable_warnings()
+        response = requests.get(url, headers=headers, verify=False, timeout=5, stream=True)
+        content_length = int(response.headers.get('Content-Length', 0))
+        if content_length > 500000:
+            print("Skipping large file:", url)
+            return result
+        # detected_encoding = chardet.detect(response.content)['encoding']
+        # decoded_content = response.content.decode(detected_encoding, errors='replace')
+        # # soup = BeautifulSoup(response.text, 'html.parser')
+        # soup = BeautifulSoup(decoded_content, 'lxml')
+        # # soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')
+        # body = "".join(soup.find('body').get_text().split('\n'))
+        # result = body
+
+        detected_encoding = chardet.detect(response.content)['encoding']
+        decoded_content = response.content.decode(detected_encoding, errors='ignore')
+        decoded_content = re.sub(r'[^\u0000-\uFFFF]', ' ', decoded_content)
+        soup = BeautifulSoup(decoded_content, 'lxml')
+        body = soup.find('body').get_text()
+        body = body.replace('\n', ' ')
+        body = re.sub(r'http[s]?://\S+', ' ', body)
+        body = re.sub(r'\s+', ' ', body)
+        result = body
+
+    except Exception as e:
+        print('\033[31m')
+        print("error url", url)
+        print("error", e)
+        print('\033[0m')
+    return result

# def Web_crawler(url: str) -> str:
# """返回链接网址url正文内容,必须是合法的网址"""
@@ -80,8 +91,8 @@ def Web_crawler(url: str) -> str:
# for url in ['https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/403', 'https://www.hostinger.com/tutorials/what-is-403-forbidden-error-and-how-to-fix-it', 'https://beebom.com/what-is-403-forbidden-error-how-to-fix/']:
# for url in ['https://www.lifewire.com/403-forbidden-error-explained-2617989']:
# for url in ['https://www.usnews.com/news/best-countries/articles/2022-02-24/explainer-why-did-russia-invade-ukraine']:
-# for url in ['https://zhidao.baidu.com/question/317577832.html']:
-for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
+for url in ['https://zhidao.baidu.com/question/317577832.html']:
+# for url in ['https://www.cnn.com/2023/09/06/tech/huawei-mate-60-pro-phone/index.html']:
# for url in ['https://www.reddit.com/r/China_irl/comments/15qojkh/46%E6%9C%88%E5%A4%96%E8%B5%84%E5%AF%B9%E4%B8%AD%E5%9B%BD%E7%9B%B4%E6%8E%A5%E6%8A%95%E8%B5%84%E5%87%8F87/', 'https://www.apple.com.cn/job-creation/Apple_China_CSR_Report_2020.pdf', 'https://hdr.undp.org/system/files/documents/hdr2013chpdf.pdf']:
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
# for url in ['https://zhuanlan.zhihu.com/p/646786536', 'https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD', 'https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
