main.py
#!/usr/bin/python3
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
import urllib3
import re
import sys
import ssl
from utils import Config
import random
# Disable the insecure-request warning; certificate verification is intentionally turned off below.
urllib3.disable_warnings()
http = urllib3.PoolManager(cert_reqs=ssl.CERT_NONE, retries=3, timeout=10, num_pools=64, maxsize=64)
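
# `utils.Config` is not shown in this file. Judging from how it is used below, it is
# assumed to provide at least the following settings (only the defaults documented in
# print_help() are known; everything else here is an inferred description, not a value):
#   THREAD_NUM  - size of the download thread pool (default 16)
#   LOG_LEVEL   - numeric logging level (default 20 / INFO)
#   LOG_FORMAT  - format string passed to logging.basicConfig
#   OUTPUT      - output directory (default './download/')
#   SITE        - site key into RULE (default 'safebooru')
#   NUM         - number of results per listing page, used as the 'pid' offset step
#   UA          - User-Agent header value
#   EXT         - list of file extensions to try, e.g. '.jpg'-style strings
#   RULE        - dict mapping a site name to {'url': listing url prefix,
#                 'url_reg': regex for thumbnail urls, 'name_reg': regex for the image name}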


def download_and_save(url, header, name):
    """Try each configured extension for the image URL and save the first successful response."""
    downloaded = False
    for ext in Config.EXT:
        response = http.request(method="GET",
                                url=url.replace(".jpg", ext),
                                headers=header)
        if response.status == 200:
            path = os.path.join(Config.OUTPUT, name + ext)
            with open(path, 'wb') as fp:
                fp.write(response.data)
                fp.flush()
            logging.info("{} {}".format("Save to:", path))
            downloaded = True
            break
    if not downloaded:
        logging.info("{} {} {} {}".format(url, "HTTP", response.status, 'failed.'))


def crawl(tags):
    """Walk the listing pages for the given tags and queue every new image for download."""
    rule = Config.RULE.get(Config.SITE, None)
    if rule is None:
        logging.error("{} {} {}".format("Site", Config.SITE, "not supported."))
        sys.exit(-1)
    page = 0
    while True:
        # Common booru site url schema: tags joined by spaces, 'pid' is the result offset.
        url = rule.get('url') + " ".join(tags) + '&pid=' + str(Config.NUM * page)
        response = http.request(method="GET",
                                url=url,
                                headers={'User-Agent': Config.UA})
        s = str(response.data, encoding='utf-8')
        header = {'User-Agent': Config.UA, 'Referer': url}
        images = re.findall(rule.get('url_reg'), s)
        for img in images:
            # Derive the original-size image url from the thumbnail url.
            img = img.replace("thumbnails", "images").replace("thumbnail_", "")
            # The unique image name doubles as the file name and as the "already downloaded" key.
            name = re.findall(rule.get('name_reg'), img)[0]
            downloaded = False
            for ext in Config.EXT:
                path = os.path.join(Config.OUTPUT, name + ext)
                if os.path.exists(path):
                    downloaded = True
                    break
            if downloaded:
                logging.warning("{} {}".format("Exist:", path))
                continue
            # Submit to the ThreadPool for downloading.
            executor.submit(download_and_save, img, header, name)
            # Ease the load on the image server.
            time.sleep(random.randint(1, 2))
        # Stop when a page yields no images.
        if len(images) == 0:
            logging.warning('No more images to download.')
            logging.warning('If this is the first page, please check the tag names or check for a script update.')
            return
        page = page + 1
        logging.info("{} {}".format("Fetching from no.", page * Config.NUM))


def print_help():
    print("Usage: main.py <arguments>")
    print("Example: main.py --tag='megumin'"
          " --tags='kono_subarashii_sekai_ni_shukufuku_wo! megumin'"
          " --site='gelbooru' --thread-num=8")
    print("")
    print("Arguments:")
    print("--tag='<tag name>'  : Single tag name. Can be given multiple times.")
    print("--tags='<tag names>': Multiple tag names separated by spaces. "
          "Within a single tag, replace spaces with underscores.")
    print("")
    print("--log-level=<num>   : (Optional) 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL. default=20")
    print("--output=<path>     : (Optional) Output directory path. default='./download/'")
    print("--site=<site name>  : (Optional) Site name. Currently safebooru and gelbooru are supported. default='safebooru'")
    print("--thread-num=<num>  : (Optional) How many downloads are allowed at the same time. default=16")
tags = set()
for v in sys.argv[1:]:
    if len(v.split("=", 1)) == 2:
        _k, _v = v.split("=", 1)
        _k = _k.lstrip("-")
        if _k == 'tag':
            tags.add(_v.strip('"').strip("'").replace(' ', '_'))
        elif _k == 'tags':
            tags = tags.union(set(_v.strip('"').strip("'").split(" ")))
        else:
            setattr(Config, _k.upper().replace("-", "_"), _v.strip('"').strip("'"))
if len(tags) == 0:
    print_help()
    sys.exit(-1)
# Configure logging and the download pool only after the command line has been applied,
# so that --log-level and --thread-num are honored.
logging.basicConfig(level=int(Config.LOG_LEVEL), format=Config.LOG_FORMAT)
executor = ThreadPoolExecutor(int(Config.THREAD_NUM))
logging.info("{} {} {} {}".format('Fetching tags', tags, 'from site', Config.SITE))
crawl(tags)