# 01_scrape_mangas.py
# import general libraries
import time
import os
import sys
from datetime import datetime

# import our project-specific functions
from functions import manga_obj
from functions import manga_utils
from functions import anilist_helpers

# script parameters
url_domain = "https://mangadex.org/"
url_main = "https://api.mangadex.org/v2/"
dir_inout = "data/jsons/"
dir_logs = "data/logs/"
skip_already_downloaded = False
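
# the id range to scrape can optionally be passed on the command line,
# e.g. `python 01_scrape_mangas.py 1 60000`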
if len(sys.argv) == 3:
    id_start = int(sys.argv[1])
    id_end = int(sys.argv[2])
else:
    id_start = 1
    id_end = 60000
assert 0 < id_start <= id_end

# set api call settings
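# session cookies are read from the environment so that no credentials are
# committed with the script; presumably a logged-in session is needed to see
# entries (e.g. R18 ones) that are hidden from anonymous users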
cookies = {}
if os.environ.get('mangadex_session'):
    cookies['mangadex_session'] = os.environ.get('mangadex_session')
if os.environ.get('mangadex_rememberme_token'):
    cookies['mangadex_rememberme_token'] = os.environ.get('mangadex_rememberme_token')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/77.0'
}
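# note: a desktop-browser user-agent is sent with every request, presumably
# because requests identifying themselves as scripts would be rejected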

# if we have a cached old file, load it in
manga_data = []
manga_data_old = manga_utils.read_raw_manga_data_files(dir_inout)
print("loaded " + str(len(manga_data_old)) + " from file")
time_start = time.time()

# create the cache output directories if they do not exist
cache_files = False
path_cache_manga_api = "data/page_manga_api/"
path_cache_manga_ext = "data/page_manga_ext/"
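# when cache_files is enabled the raw api / external-page responses are saved
# under these paths; presumably later runs can then re-read them from disk
# instead of hitting the server again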
if cache_files:
    manga_utils.make_dir_if_not(path_cache_manga_api)
    manga_utils.make_dir_if_not(path_cache_manga_ext)

# loop through each manga id in the range, and extract its metadata
manga_count_updated = 0
manga_count_new = 0
manga_count = id_start
while manga_count <= id_end:

    # create the object
    t20 = time.time()
    print("processing manga " + str(manga_count) + ":")
    data = manga_obj.MangaObj()
    data.id = manga_count
    data.url = url_domain + "manga/" + str(manga_count)

    # check if this manga has already been downloaded
    already_downloaded = False
    downloaded_manga = None
    for cmp_manga in manga_data_old:
        if data.id == cmp_manga.id:
            already_downloaded = True
            downloaded_manga = cmp_manga
            break

    # if it is already downloaded and we are being told not to re-pull,
    # then we should skip this manga so we don't need to process it!
    if already_downloaded and skip_already_downloaded:
        manga_data.append(downloaded_manga)
        manga_count += 1
        continue

    # if it is already downloaded, then copy over the existing matches
    if already_downloaded:
        data.matches = downloaded_manga.matches

    # download the json api endpoint for this manga
    # return codes: 0 = we got a 404, 1 = success, 2 = server 500 error!
    # success = data.download_and_parse_labels_soup(headers, cookies, cache_files, path_cache_manga)
    success = data.download_and_parse_labels_json(url_main, headers, cookies, cache_files, path_cache_manga_api)
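    # a 404 means this id does not exist, so skip it entirely; on success we
    # also pull the external-site links; on a server error we fall back to the
    # copy from the previous run (if we have one) rather than dropping the entry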
    if success == 0:
        manga_count += 1
        continue
    if success == 1:
        data.download_and_parse_externals(headers, cookies, cache_files, path_cache_manga_ext)
    if success == 2 and already_downloaded:
        data = downloaded_manga

    # nice debug print for this manga
    t21 = time.time()
    count_labels = len(data.content) + len(data.demographic) + len(data.format) + len(data.genre) + len(data.theme)
    print(" -> "
          + str(count_labels) + " labels | "
          + str(data.count_chapters) + " chapters | "
          + str(round(data.rating, 2)) + " rating | "
          + str(len(data.related)) + " related | "
          + ("is r18" if data.is_r18 else "not r18")
          + " (" + str(round(t21 - t20, 2)) + " sec)")

    # record the result and advance to the next id
    manga_data.append(data)
    manga_count += 1
    manga_count_updated += 1

    # check to see if we have ever seen this manga before!
    # if we have not, then this is a new manga!
    found_before = False
    for cmp_manga in manga_data_old:
        if data.id == cmp_manga.id:
            found_before = True
            break
    if not found_before:
        manga_count_new += 1

# remove any mangas that have been added to the json with the same id/title.
# this could happen if the download of the manga list takes a long time and a
# manga is updated/added in the middle, changing what is on each listing page.
ct_before = len(manga_data)
t01 = time.time()
manga_data = manga_utils.remove_dups_from_manga_list(manga_data)
ct_after = len(manga_data)
t11 = time.time()

# nice debug print
print("===========================")
print("reduced " + str(ct_before) + " to only " + str(ct_after) + " mangas (" + str(round(t11 - t01, 2)) + " seconds)")

# append all the mangas which were *not* in the id range of this run, so
# entries outside [id_start, id_end] keep their data from previous runs
count_num_appended = 0
for manga1 in manga_data_old:
    if id_start <= manga1.id <= id_end:
        continue
    manga_data.append(manga1)
    count_num_appended += 1
print("appended " + str(count_num_appended) + " non-updated mangas (" + str(len(manga_data)) + " total now)")

# build the lookup tables between the anilist/myanimelist ids and mangadex ids
# for all mangadex mangas that have linked to anilist or myanimelist;
# these will be used to reverse the ids returned by the anilist api
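# (from the names, al2md/mal2md presumably map the external ids back to
# mangadex ids, while the md2*_title dicts map mangadex ids to the linked
# titles; the exact structures are built in anilist_helpers)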
(al2md, md2al, md2al_title) = anilist_helpers.build_mangadex_to_anilist_id_lookup(manga_data)
(mal2md, md2mal, md2mal_title) = anilist_helpers.build_mangadex_to_myanimelist_id_lookup(manga_data)
manga_data = anilist_helpers.append_anilist_related_and_recs(manga_data, al2md, md2al_title, mal2md, md2mal_title)

# save our json to file!
manga_utils.write_raw_manga_data_files(dir_inout, manga_data)
print("outputted to " + dir_inout)
print("script took " + str(round(time.time() - time_start, 2))
      + " seconds (" + str(round((time.time() - time_start) / (id_end - id_start + 1), 2)) + " per manga)")

# output content to log file
manga_utils.make_dir_if_not(dir_logs)
with open(dir_logs + "log_scrape_manga.txt", "a") as myfile:
    myfile.write(
        "[" + datetime.utcnow().strftime("%B %d, %Y %H:%M:%S") + "]: " +
        "Checked " + str(ct_after) + ", updated " + str(manga_count_updated) +
        ", " + str(manga_count_new) + " new mangas. " +
        "Took " + str(round((time.time() - time_start) / 60.0, 2)) + " minutes to complete.\n"
    )