spider.py
import concurrent.futures
import logging
import random
import threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from time import sleep

import pymysql
import requests
from fake_useragent import UserAgent
uas = UserAgent()
mutex = threading.Lock()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # log level for the logger itself
file_handler = logging.FileHandler('example.log')  # handler that writes records to a file
file_handler.setLevel(logging.DEBUG)  # log level for the file handler
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')  # record format
file_handler.setFormatter(formatter)  # attach the formatter to the handler
logger.addHandler(file_handler)  # register the handler on the logger
logger.info('Task started')
def spider_(author_id, cookies):
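    """Page through one creator's videos via Kuaishou's visionProfilePhotoList
    GraphQL endpoint and upsert each video's stats into the zt_ks table.

    Stops when it reaches a non-pinned video older than three days, or when
    the feed runs out of pages.
    """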
    url = 'https://www.kuaishou.com/graphql'
    headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "zh-CN,zh;q=0.9",
        "connection": "keep-alive",
        # Content-Length is omitted on purpose: requests computes it from the body.
        "content-type": "application/json",
        "cookie": random.choice(cookies),  # rotate through the pooled cookies
        "host": "www.kuaishou.com",
        "origin": "https://www.kuaishou.com",
        "referer": f"https://www.kuaishou.com/profile/{author_id}",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": uas.random
    }
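    # Request body for the visionProfilePhotoList GraphQL query. The fragments
    # select the per-video fields used below; "pcursor" is the pagination
    # cursor and is refilled from each response.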
    data = {
        "operationName": "visionProfilePhotoList",
        "variables": {
            "userId": author_id,
            "pcursor": "",
            "page": "profile"
        },
        "query": "fragment photoContent on PhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n riskTagContent\n riskTagUrl\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n riskTagContent\n riskTagUrl\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n ...recoPhotoFragment\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n"
    }
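    # Fetch pages until the feed is empty or a non-pinned video older than
    # three days shows up.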
    while True:
        response = requests.post(url, json=data, headers=headers, timeout=30).json()
        pcursor = response["data"]["visionProfilePhotoList"]["pcursor"]  # cursor for the next page
        data["variables"]["pcursor"] = str(pcursor)
        data_list = response["data"]["visionProfilePhotoList"]["feeds"]
        if data_list:
            for photo in data_list:
                caption = photo["photo"]["caption"]
                likeCount = photo["photo"]["realLikeCount"]
                photoUrl = photo["photo"]["photoUrl"]
                video_id = photo["photo"]["id"]
                duration = int(photo["photo"]["duration"]) / 1000  # video length; the API reports milliseconds
                dura = f'{int(duration // 60)}:{int(duration % 60):02d}'  # formatted as m:ss
                date_time = photo["photo"]["timestamp"]  # publish time, 13-digit millisecond timestamp
                dt_object = datetime.fromtimestamp(date_time / 1000)
                formatted_date = dt_object.strftime('%Y-%m-%d %H:%M:%S')
                threshold = 3 * 24 * 60 * 60  # three days, in seconds
                now = datetime.now().timestamp()  # current time
                now_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current date and time as a string
                name = photo["author"]["name"]  # creator name
                tags = photo["tags"]  # video tags
                # quality tier of the first listed representation
                manifest = photo["photo"]["manifest"]["adaptationSet"][0]["representation"][0]["qualityType"]
                new_tag = ''
                if tags:
                    for tag in tags:
                        new_tag += '#' + tag["name"]
                logger.info(photo["photo"]["profileUserTopPhoto"])  # log the pinned flag
                if now - (date_time / 1000) > threshold:  # published more than three days ago?
                    # Pinned videos can sit at the top of the feed regardless of
                    # age, so only a non-pinned old video ends the crawl. Accept
                    # either a boolean or the string 'true' for the pinned flag.
                    if photo["photo"]["profileUserTopPhoto"] in (True, 'true'):
                        print('Pinned video, skipping')
                    else:
                        print('Older than three days, stopping')
                        return 'older than three days'
                else:
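                    # Within the three-day window: insert a new row, or refresh
                    # the stats on the existing one.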
                    with mutex:  # serialize DB access across worker threads
                        # Parameterized queries keep quotes and emoji in captions
                        # from breaking the SQL.
                        cursor.execute("SELECT * FROM `zt_ks` WHERE video_id=%s", (video_id,))
                        video_id_list = cursor.fetchall()
                        preview_url = (f"https://www.kuaishou.com/short-video/{video_id}"
                                       f"?authorId={author_id}&streamSource=profile&area=profilexxnull")
                        print(author_id)
                        if len(video_id_list) == 0:
                            insert_sql = (
                                "INSERT INTO zt_ks (author_id, author_name, playlet_title, playlet_url, issue_time, "
                                "create_time, update_time, digg_count, three_digg_count, video_id, playlet_tag, "
                                "playlet_duration, playlet_size, preview_url) "
                                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                            cursor.execute(insert_sql, (author_id, name, caption, photoUrl, dt_object,
                                                        now_date, now_date, likeCount, 0, video_id,
                                                        new_tag, dura, manifest, preview_url))
                            db.commit()
                            logger.info(f"creator: {name} desc: {caption} video id: {video_id}")
                        else:
                            # likes gained relative to the previously stored count
                            three_likeCount = int(likeCount) - video_id_list[0][8]
                            update_sql = ("UPDATE zt_ks SET update_time=%s, digg_count=%s, "
                                          "three_digg_count=%s, preview_url=%s WHERE video_id=%s")
                            cursor.execute(update_sql, (now_date, likeCount, three_likeCount,
                                                        preview_url, video_id))
                            db.commit()
                            logger.info('Existing row, stats updated')
                sleep(random.randint(1, 3))  # throttle between items
        else:
            print('Empty feed')
            return 'empty'
# db = pymysql.connect(host="172.17.0.142", port=3306, user="jzzt", password="jzzt#2024", database="zt")  # production server
db = pymysql.connect(host="192.168.1.80", port=3306, user="remo80", password="juzhun2023", database="zt")
cursor = db.cursor()
select_sql = "SELECT author_url FROM `zt_ks_author` WHERE author_url!='' and status_flag=1"
cursor.execute(select_sql)
author_list = cursor.fetchall()
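# Profile URLs end with the creator id; keep the last path segment.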
author_id = []
for author in author_list:
    author_id.append(author[0].split('/')[-1])
cookies = []
cks = 'SELECT cookie FROM zt_dy_cookies WHERE status=1 and cookies_type=2'
cursor.execute(cks)
cookie_s = cursor.fetchall()
for cook in cookie_s:
    cookies.append(cook[0])
# Concurrent variant: crawl every creator with a small thread pool.
# with ThreadPoolExecutor(max_workers=3) as executor:
#     # submit one crawl task per creator
#     future_to_id = {executor.submit(spider_, au_id, cookies): au_id for au_id in author_id}
#     # collect results as they finish
#     for future in concurrent.futures.as_completed(future_to_id):
#         au_id = future_to_id[future]
#         try:
#             future.result()
#         except Exception as exc:
#             logger.error(f'{au_id}: {exc}')
#
# logger.info('Task finished')
# cursor.close()
# db.close()
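# Single-creator run; swap in the pool block above to crawl every author_id.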
spider_('3x5bhpnp9kacggk', cookies)