-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter.py
92 lines (78 loc) · 2.86 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import json
import os
import time
import requests
from bs4 import BeautifulSoup
from langchain_community.callbacks import OpenAICallbackHandler
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
# consts
# Site root: fetched as the listing page and also handed to the LLM as the
# prefix for completing relative article links.
URL_PREFIX = "https://www.jiqizhixin.com"

# input
# Prompt sent to the LLM. Placeholders: {url_prefix}, {output_format} and
# {html_article_list}. Task 3 previously read "提前" (a typo); it now reads
# "提取" ("extract") so it matches tasks 1 and 2.
text_prompt_template = """以下是一段 HTML 文本,包含了一些文章的标题和链接,请你完成以下任务,并按格式返回结果:
1. 提取文章的标题
2. 提取文章的链接,如果链接不完整,默认前缀是 {url_prefix}
3. 提取文章的封面图
注意:只需要返回结果,不需要对过程进行解释
返回格式:
{output_format}
HTML 文本:
```html
{html_article_list}
```
"""
# output
class ArticleListItem(BaseModel):
    """Schema of one extracted article: title, link and cover image.

    The field descriptions are runtime text: they are embedded in the format
    instructions that get sent to the LLM.
    """

    # `default=...` (Ellipsis) marks each field as required.
    title: str = Field(
        default=...,
        description="该文章的标题",
    )
    url: str = Field(
        default=...,
        description="该文章的链接",
    )
    image_url: str = Field(
        default=...,
        description="该文章的封面图",
    )
# chain
# Format instructions are generated from the pydantic schema and baked into
# the prompt together with the URL prefix, leaving only `html_article_list`
# to be supplied at invoke time.
# NOTE(review): the instructions describe a single ArticleListItem, while the
# code that consumes the chain output iterates it as a list -- confirm the
# model reliably answers with a JSON array.
output_format = PydanticOutputParser(
    pydantic_object=ArticleListItem,
).get_format_instructions()
prompt_template = PromptTemplate.from_template(text_prompt_template)
prompt = prompt_template.partial(
    output_format=output_format,
    url_prefix=URL_PREFIX,
)

# The callback handler is attached to the model; its token counters are
# printed after the chain has run.
callback = OpenAICallbackHandler()
llm = ChatOpenAI(
    model="simaas-qwen2-5-72b-instruct-v1",
    base_url="https://console.siflow.cn/model-api",
    callbacks=[callback],
    temperature=0,
)

# The reply is parsed as plain JSON (not validated against ArticleListItem).
output_parser = JsonOutputParser()
chain = prompt | llm | output_parser
# prepare data
# Download the site's landing page with a browser-like User-Agent header.
resp = requests.get(
    URL_PREFIX,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    },
    # requests applies no timeout by default, so a stalled server would hang
    # the script forever; fail with an exception instead.
    timeout=30,
)
if not resp.ok:
    print(resp.text)
    # `exit()` is injected by the `site` module and is not guaranteed to
    # exist; raising SystemExit yields the same exit status without it.
    raise SystemExit(-1)

# Keep only the container that holds the article list so the prompt stays small.
soup = BeautifulSoup(resp.text, "lxml")
div_article_list = soup.find("div", class_="js-article-container")
if div_article_list is None:
    # Without this guard str(None) == "None" would be sent to the LLM as the
    # HTML to analyse.
    print('article container not found: div.js-article-container')
    raise SystemExit(-1)
html_article_list = str(div_article_list)
# get summaries
# Run the chain once on the extracted HTML and report wall-clock time plus
# the token counters collected by the callback handler.
started_at = time.perf_counter()
article_list = chain.invoke({"html_article_list": html_article_list})
elapsed = time.perf_counter() - started_at
print(f"time cost: {elapsed:.2f}s")
for token_count in (callback.prompt_tokens, callback.completion_tokens):
    print(token_count)
# avoid repetition when write
# Merge the freshly extracted articles into articles.json. Entries are keyed
# by URL, so an article already stored by an earlier run is replaced rather
# than duplicated.
url_articles_dict = {}

# Load what previous runs stored. Opening directly (instead of checking
# os.path.exists first) avoids a check-then-use race; a missing file simply
# means there is nothing to merge yet.
try:
    with open("articles.json", encoding="utf-8") as fh:
        str_articles = fh.read()
except FileNotFoundError:
    str_articles = ""
if str_articles.strip():
    for article in json.loads(str_articles):
        url_articles_dict[article["url"]] = article

# The chain output is parsed JSON that was never validated against a schema:
# tolerate a single object in place of a list, and skip entries that are not
# dicts or have no usable "url" instead of crashing before anything is saved.
if isinstance(article_list, dict):
    new_articles = [article_list]
else:
    new_articles = article_list or []
for article in new_articles:
    if isinstance(article, dict) and article.get("url"):
        url_articles_dict[article["url"]] = article
    else:
        print(f"skip malformed article entry: {article!r}")

# ensure_ascii=False keeps Chinese titles readable in the UTF-8 file rather
# than escaping every character as \uXXXX; json.loads reads both forms.
with open("articles.json", "w", encoding="utf-8") as fh:
    fh.write(json.dumps(list(url_articles_dict.values()), ensure_ascii=False))