
Commit

Optimize output
CYDXDianXian authored Feb 16, 2022
1 parent 933ec9d commit 53844a6
Showing 1 changed file with 29 additions and 19 deletions.
48 changes: 29 additions & 19 deletions pcr_cg_spider_multithreading.py
@@ -19,49 +19,59 @@

 # Image download-link scraper (synchronous)
 def get_urls():
+    global get_urls_msg
+    start_time = time.time()
     base = "https://redive.estertion.win/card/story/"
     print('Starting crawl, please wait......\n')
-    resp = requests.get(base, headers = headers, proxies = proxy)  # request the url with GET and bind the response to resp; proxies = proxy adds the proxy
-    if resp.status_code == 404:
-        print(f"Page request failed, status code: 404")
+    try:
+        resp = requests.get(base, headers = headers, proxies = proxy, timeout = 30)  # request the url with GET and bind the response to resp; proxies = proxy adds the proxy
+    except requests.exceptions.RequestException:
+        print('Request timed out; please check your network connection')
+        sys.exit()  # terminate the program
     resp.encoding = 'utf-8'
     # print(resp.text)
 
     tree = etree.HTML(resp.text)
     result = tree.xpath("/html/body/span/a/@href")  # extract the list of links via XPath
 
     urls = [base + i for i in result]  # join base with each item i of result and collect the results into urls
+    end_time = time.time()
+    use_time = int(end_time - start_time)
     print(f'Found {len(urls)} file addresses; download will begin shortly\n')
+    get_urls_msg = f'\nFound {len(urls)} files; crawl took {use_time} s'
+    time.sleep(3)
     return urls

 # Image downloader (asynchronous)
-num = []
+success_download = 0  # counters live at module level so they are not reset on every call
+error_download = 0
 async def aiodownload(url):
+    global success_download, error_download  # the global declaration is required, otherwise the download counts go wrong
     name = url.split("/")[-1]  # take everything after the last / in the url (the image name)
-    if not Path(path, name).exists():  # download only if the file does not exist; Path(path, name) joins the directory and file name
-        async with aiohttp.ClientSession(headers = headers) as session:  # aiohttp.ClientSession() is the aiohttp counterpart of requests; headers may be set here or on the request, one place is enough
-            async with session.get(url) as resp:  # session.get() is the counterpart of requests.get(); proxy selects a proxy; the default timeout is 5 minutes
-                Path(path, name).write_bytes(await resp.content.read())  # Path.write_bytes(data) opens the file in binary mode, writes the bytes and closes it; an existing file with the same name is overwritten; this also fixes the bug where images came out as 0 KB
-                # resp.content.read() yields bytes, resp.text() a string, resp.json() a JSON object; asynchronous reads must be awaited
-                # resp.content and resp.text as attributes only apply to requests (requests.get().content / .text); resp.json() works the same with either library
-                print("ok", name)
-                num.append(1)
-    else:
-        print(f'File {name} already exists; skipping download')
+    try:
+        if not Path(path, name).exists():  # download only if the file does not exist; Path(path, name) joins the directory and file name
+            async with aiohttp.ClientSession(headers = headers) as session:  # aiohttp.ClientSession() is the aiohttp counterpart of requests; headers may be set here or on the request, one place is enough
+                async with session.get(url) as resp:  # session.get() is the counterpart of requests.get(); proxy selects a proxy; the default timeout is 5 minutes
+                    Path(path, name).write_bytes(await resp.content.read())  # Path.write_bytes(data) opens the file in binary mode, writes the bytes and closes it; an existing file with the same name is overwritten; this also fixes the bug where images came out as 0 KB
+                    # resp.content.read() yields bytes, resp.text() a string, resp.json() a JSON object; asynchronous reads must be awaited
+                    # resp.content and resp.text as attributes only apply to requests (requests.get().content / .text); resp.json() works the same with either library
+                    print("ok", name)
+                    success_download += 1
+        else:
+            print(f'File {name} already exists; skipping download')
+    except Exception:
+        error_download += 1
+        print(f'Image downloads failed ({error_download} so far): {name}')

 # Main coroutine
 async def main():
-    start = time.time()
-
     urls = get_urls()  # urls is needed more than once later; calling get_urls just once saves work
+    start = time.time()
     tasks = [aiodownload(url) for url in urls]  # build the list of download tasks
     await asyncio.wait(tasks)
 
     end = time.time()
 
-    print(f'All done!!! Took {end - start} s; {len(num)} files downloaded, {len(urls) - len(num)} files not downloaded\n')
+    print(get_urls_msg)
+    print(f'{success_download} files downloaded successfully, {error_download} failed; download took {int(end - start)} s\n')
     print('The program will exit in 10 seconds......')
     await asyncio.sleep(10)
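
A design note on the counters introduced by this commit: the same totals can be gathered without module-level globals by having each download coroutine return its outcome and tallying the results of asyncio.gather. A minimal sketch of that alternative, where download_one is a hypothetical stand-in for aiodownload:

    import asyncio
    from collections import Counter

    async def download_one(url):
        # Hypothetical stand-in for aiodownload: report the outcome
        # as a return value instead of mutating shared counters.
        try:
            ...  # fetch the image and write it to disk, as aiodownload does
            return "ok"
        except Exception:
            return "error"

    async def run_all(urls):
        results = await asyncio.gather(*(download_one(u) for u in urls))
        counts = Counter(results)  # e.g. Counter({'ok': 120, 'error': 3})
        print(f"{counts['ok']} succeeded, {counts['error']} failed")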
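
The diff's comment notes that aiohttp's default timeout is five minutes, and the code opens a new ClientSession per download. A hedged sketch of an alternative, assuming a single shared session, an explicit 30-second timeout, and a cap of 10 concurrent downloads (all three numbers are illustrative, not taken from the original):

    import asyncio
    import aiohttp

    async def fetch(session, sem, url):
        async with sem:  # at most 10 downloads in flight at once
            async with session.get(url) as resp:
                return await resp.content.read()

    async def run(urls):
        timeout = aiohttp.ClientTimeout(total=30)  # 30 s per request instead of 5 min
        sem = asyncio.Semaphore(10)
        # Reusing one session for all requests avoids the cost of
        # opening a new connection pool per download.
        async with aiohttp.ClientSession(timeout=timeout) as session:
            return await asyncio.gather(*(fetch(session, sem, u) for u in urls))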
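
The hunk shown here ends before the script's entry point. A typical driver, assuming Python 3.7+ and not confirmed by the visible diff, would be:

    if __name__ == '__main__':
        # asyncio.run creates and closes the event loop. Note that
        # asyncio.wait() on bare coroutines, as used in main(), is
        # deprecated from Python 3.8 and rejected in 3.11, so wrapping
        # each coroutine in asyncio.create_task() inside main() is safer.
        asyncio.run(main())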

