"""Crawl Baidu search results for a keyword via the baidu.com/s endpoint."""
import urllib.parse
from urllib.parse import urljoin

import requests
from lxml import etree
from requests.exceptions import RequestException


def format_url(url, params: dict) -> str:
    """Append URL-encoded query parameters to a base URL."""
    query_str = urllib.parse.urlencode(params)
    return f'{url}?{query_str}'


def get_url(keyword):
    # Build the Baidu search URL for the given keyword.
    params = {
        'wd': str(keyword)
    }
    url = "https://www.baidu.com/s"
    url = format_url(url, params)
    # print(url)
    return url
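

# For example, get_url("python") yields "https://www.baidu.com/s?wd=python";
# non-ASCII keywords are percent-encoded by urlencode, so a Chinese keyword
# such as "爬虫" becomes ".../s?wd=%E7%88%AC%E8%99%AB".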


def get_page(url):
    """Fetch a results page; return its HTML text, or None on failure."""
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
        response = requests.get(url=url, headers=headers)
        # Force UTF-8 decoding; without this the page text comes back garbled.
        response.encoding = "utf-8"
        print(response.status_code)
        # print(response.text)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
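

# Standalone illustration (not called by the crawler) of the extraction
# pattern parse_page relies on: lxml's string(.) XPath collapses an element
# and all of its children into plain text, which is why it works for result
# titles that contain <em> highlight tags. The sample markup is made up for
# the demo.
def _demo_string_xpath():
    doc = etree.HTML('<div id="3001"><h3><a>Python <em>tutorial</em></a></h3></div>')
    node = doc.xpath('//*[@id="3001"]/h3/a')[0]
    return node.xpath('string(.)')  # -> 'Python tutorial'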


def parse_page(url, page):
    """Walk through `page` result pages, printing each result's title, link and abstract."""
    # Element ids Baidu assigns to the organic results on a results page.
    lis = [3001, 3002, 3003, 4001, 5001, 5002]
    for i in range(1, int(page) + 1):
        print("Crawling page {}....".format(i))
        # Position of the "next page" link in the pager; the first page has
        # no "previous page" link, so the link sits one slot earlier.
        flag = 11
        if i == 1:
            flag = 10
        html = get_page(url)
        if html is None:
            print("Failed to fetch the page, stopping.")
            return
        content = etree.HTML(html)
        for j in lis:
            title = ""
            sub_url = ""
            abstract = ""
            try:
                # print("j=" + str(j))
                title_nodes = content.xpath('//*[@id="%d"]/div[1]/h3/a' % j)
                if title_nodes:
                    title = title_nodes[0].xpath('string(.)')
                    print(title)
                sub_urls = content.xpath('//*[@id="%d"]/div[1]/h3/a/@href' % j)
                if sub_urls:
                    sub_url = sub_urls[0]
                    print(sub_url)
                # The abstract lives in one of several layouts; try each in turn.
                str_abstract = content.xpath('//*[@id="%d"]/div[2]/div[1]/a' % j)
                if not str_abstract:
                    str_abstract = content.xpath('//*[@id="%d"]/div[2]/div/div[2]/div[1]/a' % j)
                if not str_abstract:
                    str_abstract = content.xpath('//*[@id="%d"]/div[2]/div/div[2]/div' % j)
                if str_abstract:
                    abstract = str_abstract[0].xpath('string(.)')
                print(abstract)
            except Exception:
                # A result block may be an ad or use yet another layout; skip it.
                pass
        # Follow the "next page" link in the pager to get the next URL.
        rel_url = content.xpath('//*[@id="page"]/a[{}]/@href'.format(flag))
        if rel_url:
            url = urljoin(url, rel_url[0])
        else:
            print("No more pages!~")
            return


def main():
    keyword = input("Enter a keyword: ")
    page = input("Enter the number of pages to crawl: ")
    url = get_url(keyword)
    parse_page(url, page)
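

# Hypothetical helper, not called anywhere above: a minimal sketch of how
# results could be persisted if parse_page were reworked into a generator
# that yields {'title': ..., 'sub_url': ..., 'abstract': ...} dicts instead
# of printing. The name save_results and the data.json default are
# assumptions, not part of the crawler as written.
def save_results(results, path="data.json"):
    import json
    with open(path, 'w', encoding='utf-8') as f:
        for result in results:
            # One JSON object per line keeps the file easy to append to and parse.
            f.write(json.dumps(result, ensure_ascii=False) + "\n")
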
if __name__ == '__main__':
main()