-
Notifications
You must be signed in to change notification settings - Fork 4k
/
python_splash.py
42 lines (32 loc) · 1.17 KB
/
python_splash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# -*- coding: utf-8 -*-
"""
Fetch Ajax-rendered pages through a Splash server.
"""
import json
import requests
# Installing Splash with Docker: https://splash.readthedocs.io/en/latest/install.html
CRAWLER_URL = "http://weixin.sogou.com/weixin?page=1&type=2&query=%E4%B8%AD%E5%9B%BD"
# render.html
def test_1(url):
    """Render ``url`` through Splash's ``render.html`` endpoint and print it.

    Posts a JSON body of render options to the Splash server, then prints
    the target URL together with the HTTP status code, followed by the
    response text.

    Args:
        url: Address of the page that Splash should load and render.

    Raises:
        requests.exceptions.RequestException: If the Splash server cannot be
            reached or does not answer within the client-side timeout.
    """
    # Replace xx.xx.xx.xx with the address of the Splash server.
    render = "http://xx.xx.xx.xx:8050/render.html"
    body = json.dumps({
        "url": url,
        "wait": 0.5,  # how long to wait for the page to load
        "images": 0,  # whether to fetch images
        "timeout": 3,  # Splash-side expiry time for the render
        # "allowed_domains": ["sogou.com", ],  # restrict to allowed domains
        "allowed_content_types": "text/html; charset=utf-8"
    })
    headers = {"Content-Type": "application/json"}
    # Without a client-side timeout requests waits forever on a dead or
    # stalled host. Keep it above the 3 s Splash-side timeout so Splash gets
    # the chance to report its own timeout error first.
    response = requests.post(
        url=render, headers=headers, data=body, timeout=10
    )
    print(url, response.status_code)
    print(response.text)
    return
# test_1(CRAWLER_URL)
# render.png
def test_2(url):
    """Request a rendering of ``url`` from Splash's ``render.png`` endpoint.

    Sends a GET request to the Splash server and prints the target URL
    together with the HTTP status code. The response body is not used.

    Args:
        url: Address of the page that Splash should load and render.

    Raises:
        requests.exceptions.RequestException: If the Splash server cannot be
            reached or does not answer within the client-side timeout.
    """
    # Replace xx.xx.xx.xx with the address of the Splash server.
    render = "http://xx.xx.xx.xx:8050/render.png"
    # Pass the arguments via ``params`` so requests percent-encodes them.
    # Splicing the raw target URL into the query string lets any "&" inside
    # it split the target's own arguments off as separate Splash arguments,
    # so Splash would render a truncated URL.
    query = {"url": url, "timeout": 5}
    # Client-side timeout kept above the 5 s Splash-side timeout so a dead
    # or stalled host cannot block this call forever.
    response = requests.get(url=render, params=query, timeout=10)
    print(url, response.status_code)
    return
# test_2(CRAWLER_URL)