forked from Spicadox/auto-ytarchive-raw
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetjson.py
124 lines (100 loc) · 4.12 KB
/
getjson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import urllib.request
import urllib.parse
import re
import html
import base64
import datetime
import json
import utils
# Format version stamped into every JSON document produced by get_json()
# (stored under the "version" key).
VERSION = "1.5"

# itag preference tables consulted by get_json(): entries are tried from first
# to last and the first usable one wins, so earlier means "more preferred".
PRIORITY = {
    "VIDEO": [
        # 2160p60
        337, 315, 266, 138,
        # 2160p
        313, 336,
        # 1440p60
        308,
        # 1440p
        271, 264,
        # 1080p60
        335, 303, 299,
        # 1080p
        248, 169, 137,
        # 720p60
        334, 302, 298,
        # 720p
        247, 136,
    ],
    "AUDIO": [251, 141, 171, 140, 250, 249, 139],
}
def parse(regex, html_raw):
    """Return the value captured by *regex* in *html_raw*, HTML-unescaped.

    Group 1 is used when it captured something non-empty; otherwise group 2
    is used if the pattern defines one (this supports ``a(..)|b(..)`` style
    alternations). If neither group captured text, an empty string is
    returned.

    Args:
        regex: Pattern containing at least one capturing group.
        html_raw: Text to search.

    Returns:
        The captured text after ``html.unescape``.

    Raises:
        AttributeError: If *regex* does not match at all. The type is kept
            from the previous implementation (which failed on
            ``None.group``) so existing ``except`` clauses keep working.
        IndexError: If *regex* has no capturing group.
    """
    # Search once; the previous version ran the same search twice.
    match = re.search(regex, html_raw)
    if match is None:
        raise AttributeError(f"pattern {regex!r} not found in page source")

    value = match.group(1)
    # Only fall back to group 2 when the pattern really has one; asking for a
    # non-existent group raises IndexError.
    if not value and match.re.groups >= 2:
        value = match.group(2)

    # An optional group that did not participate yields None, which
    # html.unescape() cannot handle.
    return html.unescape(value or "")
def get_youtube_id(url):
    """Return the YouTube video id that *url* refers to.

    The id is first extracted from the URL text itself (``youtu.be/<id>``,
    ``/v/``, ``/vi/``, ``/embed/``, ``?v=`` / ``&v=`` and similar forms).
    When the URL carries no recognisable id, the page is downloaded through
    ``utils.urlopen`` and the id is read from its
    ``<meta itemprop="videoId" ...>`` tag.

    Args:
        url: Address of a video, or of a page that embeds a ``videoId`` meta
            tag.

    Returns:
        The video id as a string.

    Raises:
        AttributeError: If the fallback page contains no ``videoId`` meta tag.
    """
    try:
        return re.search(r'^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*', url).group(1)
    except (AttributeError, TypeError):
        # AttributeError: the pattern did not match (re.search returned None).
        # TypeError: *url* is not a string. Both still take the page-fetch
        # fallback as before, but unlike the former bare ``except:`` this no
        # longer swallows KeyboardInterrupt/SystemExit or unrelated errors.
        pass

    with utils.urlopen(url) as response:
        html_raw = response.read().decode()
    regex = r'<meta itemprop="videoId" content="(.+?)">'
    return re.search(regex, html_raw).group(1)
def get_youtube_video_info(video_id, channel_id, channel_name, html_raw):
    """Assemble the metadata dictionary for one video from its watch page.

    Args:
        video_id: Video id; stored as ``"id"`` and used to build the fallback
            thumbnail URL.
        channel_id: Channel id; used to build ``"channelURL"``.
        channel_name: Stored unchanged as ``"channelName"``.
        html_raw: Source text of the watch page, scraped with ``parse``.

    Returns:
        A dict with the keys ``title``, ``id``, ``channelName``,
        ``channelURL``, ``description``, ``thumbnail`` (the result of
        ``get_image`` for the thumbnail URL), ``thumbnailUrl`` and
        ``startTimestamp``.
    """
    # Use the page's own image_src link when it has one; otherwise fall back
    # to the maxresdefault thumbnail address for this video id.
    if '<link rel="image_src" href="' in html_raw:
        thumbnail_url = parse(r'<link rel="image_src" href="(.+?)">', html_raw)
    else:
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"

    title = parse(r'<meta name="title" content="(.+?)">', html_raw)

    # The description is optional; when present, literal backslash-n
    # sequences in the page source are turned into real newlines.
    description = ""
    if '"description":{"simpleText":"' in html_raw:
        raw_description = parse(r'"description":{"simpleText":"(.+?)"},', html_raw)
        description = raw_description.replace("\\n", "\n")

    thumbnail_data = get_image(thumbnail_url)
    start_timestamp = parse(r'"startTimestamp":"(.+?)"', html_raw)

    info = {
        "title": title,
        "id": video_id,
        "channelName": channel_name,
        "channelURL": f"https://www.youtube.com/channel/{channel_id}",
        "description": description,
        "thumbnail": thumbnail_data,
        "thumbnailUrl": thumbnail_url,
        "startTimestamp": start_timestamp,
    }
    return info
def get_image(url):
    """Fetch *url* via ``utils.urlopen`` and return its bytes as a data URI.

    The result always carries the ``data:image/jpeg;base64,`` prefix; the
    downloaded bytes are not inspected to determine their real image type.
    """
    with utils.urlopen(url) as response:
        raw_bytes = response.read()
    encoded = base64.b64encode(raw_bytes).decode()
    return "data:image/jpeg;base64," + encoded
def build_req(video_id, use_cookie=False):
    """Open the watch page of *video_id* and return the ``utils.urlopen`` result.

    The request is sent with a desktop Chrome ``User-Agent`` header.
    *use_cookie* is forwarded unchanged to ``utils.urlopen``.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    }
    watch_request = urllib.request.Request(
        f"https://www.youtube.com/watch?v={video_id}",
        headers=browser_headers,
    )
    return utils.urlopen(watch_request, use_cookie=use_cookie)
def _pick_stream(streams, itags):
    """Return ``{itag: url}`` for the first preferred, fragment-capable stream.

    *streams* maps itag strings to URLs as scraped from the page; *itags* is
    the preference-ordered list of integer itags. Returns ``None`` when no
    listed itag is present with a ``noclen`` URL.
    """
    for itag in map(str, itags):
        url = streams.get(itag)
        # With the `noclen` param, the stream can be downloaded by fragments.
        if url is not None and "noclen" in url:
            # The page source escapes "&" as the literal text \u0026.
            return {itag: url.replace("\\u0026", "\u0026")}
    return None


def get_json(video_url, channel_id, channel_name, file=None, require_cookie=False):
    """Collect the best video/audio stream URLs and metadata for a video.

    Args:
        video_url: URL understood by ``get_youtube_id``.
        channel_id: Passed on to ``get_youtube_video_info``.
        channel_name: Passed on to ``get_youtube_video_info``.
        file: Optional path; when given, the result is also written there as
            UTF-8 JSON.
        require_cookie: Forwarded to ``build_req`` as its cookie flag.

    Returns:
        A dict with the keys ``video`` and ``audio`` (each ``{itag: url}`` or
        ``None`` when nothing suitable was found), ``metadata``, ``version``
        and ``createTime`` (timezone-aware UTC timestamp in ISO 8601 format).
    """
    video_id = get_youtube_id(video_url)

    with build_req(video_id, require_cookie) as response:
        data = response.read().decode()

    # Map itag -> url; for a repeated itag the last occurrence wins.
    match = dict(re.findall(r'"itag":(\d+),"url":"([^"]+)"', data))

    metadata = get_youtube_video_info(video_id, channel_id, channel_name, data)
    best = {
        "video": _pick_stream(match, PRIORITY["VIDEO"]),
        "audio": _pick_stream(match, PRIORITY["AUDIO"]),
        "metadata": metadata,
        "version": VERSION,
        # datetime.utcnow() is deprecated; this yields the same aware UTC value.
        "createTime": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    }

    if best["video"] is None or best["audio"] is None:
        if best["video"] is None:
            utils.warn(f" {video_id} got empty video sources.")
        if best["audio"] is None:
            utils.warn(f" {video_id} got empty audio sources.")
        # NOTE(review): this message is emitted whether or not require_cookie
        # was set; wording kept unchanged for log compatibility.
        utils.warn("Failed to get json with cookies")
        # Dump every itag/url pair that was found, to help diagnose the miss.
        print(match)

    if file is not None:
        with open(file, "w", encoding="utf8") as f:
            json.dump(best, f, indent=4, ensure_ascii=False)

    return best