diff --git a/lib/yt_dlp/cookies.py b/lib/yt_dlp/cookies.py
index 0de0672e1..815897d5a 100644
--- a/lib/yt_dlp/cookies.py
+++ b/lib/yt_dlp/cookies.py
@@ -46,7 +46,7 @@
 from .utils._utils import _YDLLogger
 from .utils.networking import normalize_url
 
-CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
+CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
 SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
 
@@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
             'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
             'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
+            'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
         }[browser_name]
 
     elif sys.platform == 'darwin':
@@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata, 'Microsoft Edge'),
             'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
             'vivaldi': os.path.join(appdata, 'Vivaldi'),
+            'whale': os.path.join(appdata, 'Naver/Whale'),
         }[browser_name]
 
     else:
@@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(config, 'microsoft-edge'),
             'opera': os.path.join(config, 'opera'),
             'vivaldi': os.path.join(config, 'vivaldi'),
+            'whale': os.path.join(config, 'naver-whale'),
         }[browser_name]
 
     # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
         'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
         'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
         'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
+        'whale': 'Whale',
     }[browser_name]
 
     browsers_without_profiles = {'opera'}
diff --git a/lib/yt_dlp/extractor/bbc.py b/lib/yt_dlp/extractor/bbc.py
index 015af9e1d..f6b58b361 100644
--- a/lib/yt_dlp/extractor/bbc.py
+++ b/lib/yt_dlp/extractor/bbc.py
@@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'url': 'http://www.bbc.com/news/world-europe-32668511',
         'info_dict': {
             'id': 'world-europe-32668511',
-            'title': 'Russia stages massive WW2 parade',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
             'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
         },
         'playlist_count': 2,
@@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': '3662a707-0af9-3149-963f-47bea720b460',
             'title': 'BUGGER',
+            'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
         },
         'playlist_count': 18,
     }, {
@@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': 'p02mprgb',
             'ext': 'mp4',
-            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
-            'description': 'md5:2868290467291b37feda7863f7a83f54',
+            'title': 'Germanwings crash site aerial video',
+            'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
             'duration': 47,
             'timestamp': 1427219242,
             'upload_date': '20150324',
+            'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
         },
         'params': {
-            # rtmp download
             'skip_download': True,
         }
     }, {
@@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'now SIMORGH_DATA with no video',
     }, {
         # single video embedded with data-playable containing XML playlists (regional section)
         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
         'info_dict': {
-            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+            'id': '39275083',
+            'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
             'ext': 'mp4',
             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
-            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+            'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
             'timestamp': 1434713142,
             'upload_date': '20150619',
+            'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
         },
         'params': {
             'skip_download': True,
-        }
+        },
     }, {
         # single video from video playlist embedded with vxp-playlist-data JSON
         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': '404 Not Found',
     }, {
-        # single video story with digitalData
+        # single video story with __PWA_PRELOADED_STATE__
         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
         'info_dict': {
             'id': 'p02q6gc4',
-            'ext': 'flv',
-            'title': 'Sri Lanka’s spicy secret',
-            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
-            'timestamp': 1437674293,
-            'upload_date': '20150723',
+            'ext': 'mp4',
+            'title': 'Tasting the spice of life in Jaffna',
+            'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
+            'timestamp': 1646058397,
+            'upload_date': '20220228',
+            'duration': 255,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
         # single video story without digitalData
         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
@@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'timestamp': 1415867444,
             'upload_date': '20141113',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
+        'skip': 'redirects to TopGear home page',
     }, {
         # single video embedded with Morph
+        # TODO: replacement test page
        'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
         'info_dict': {
             'id': 'p041vhd0',
@@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'BBC Sport',
             'uploader_id': 'bbc_sport',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'skip': 'Georestricted to UK',
+        'skip': 'Video no longer in page',
     }, {
-        # single video with playlist.sxml URL in playlist param
+        # single video in __INITIAL_DATA__
         'url': 'http://www.bbc.com/sport/0/football/33653409',
         'info_dict': {
             'id': 'p02xycnp',
             'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'title': 'Ronaldo to Man Utd, Arsenal to spend?',
+            'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
+            'timestamp': 1437750175,
+            'upload_date': '20150724',
+            'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
             'duration': 140,
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        }
     }, {
-        # article with multiple videos embedded with playlist.sxml in playlist param
+        # article with multiple videos embedded with Morph.setPayload
         'url': 'http://www.bbc.com/sport/0/football/34475836',
         'info_dict': {
             'id': '34475836',
@@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
         },
         'playlist_count': 3,
+    }, {
+        # Testing noplaylist
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': 'p034ppnv',
+            'ext': 'mp4',
+            'title': 'All you need to know about Jurgen Klopp',
+            'timestamp': 1444335081,
+            'upload_date': '20151008',
+            'duration': 122.0,
+            'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
     }, {
         # school report article with single video
         'url': 'http://www.bbc.co.uk/schoolreport/35744779',
@@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'title': 'School which breaks down barriers in Jerusalem',
         },
         'playlist_count': 1,
+        'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
     }, {
         # single video with playlist URL from weather section
         'url': 'http://www.bbc.com/weather/features/33601775',
@@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'thumbnail': r're:https?://.+/.+\.jpg',
             'timestamp': 1437785037,
             'upload_date': '20150725',
+            'duration': 105,
         },
     }, {
         # video with window.__INITIAL_DATA__ and value as JSON string
         'url': 'https://www.bbc.com/news/av/world-europe-59468682',
         'info_dict': {
-            'id': 'p0b71qth',
+            'id': 'p0b779gc',
             'ext': 'mp4',
             'title': 'Why France is making this woman a national hero',
-            'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+            'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
             'thumbnail': r're:https?://.+/.+\.jpg',
-            'timestamp': 1638230731,
-            'upload_date': '20211130',
+            'timestamp': 1638215626,
+            'upload_date': '20211129',
+            'duration': 125,
+        },
+    }, {
+        # video with script id __NEXT_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
         },
     }, {
         # single video article embedded with data-media-vpid
@@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'uploader': 'Radio 3',
             'uploader_id': 'bbc_radio_three',
         },
+        'skip': '404 Not Found',
     }, {
         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
         'info_dict': {
@@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
             'ext': 'mp4',
             'title': 'md5:2fabf12a726603193a2879a055f72514',
             'description': 'Learn English words and phrases from this story',
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
         },
         'add_ie': [BBCCoUkIE.ie_key()],
     }, {
@@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
         'info_dict': {
             'id': 'p07c6sb9',
             'ext': 'mp4',
-            'title': 'How positive thinking is harming your happiness',
-            'alt_title': 'The downsides of positive thinking',
-            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'title': 'The downsides of positive thinking',
+            'description': 'The downsides of positive thinking',
             'duration': 235,
-            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
-            'upload_date': '20190604',
-            'categories': ['Psychology'],
+            'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
+            'upload_date': '20220223',
+            'timestamp': 1645632746,
         },
     }, {
         # BBC Sounds
-        'url': 'https://www.bbc.co.uk/sounds/play/m001q78b',
+        'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
         'info_dict': {
-            'id': 'm001q789',
+            'id': 'p0hrw4nr',
             'ext': 'mp4',
-            'title': 'The Night Tracks Mix - Music for the darkling hour',
-            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg',
-            'chapters': 'count:8',
-            'description': 'md5:815fb51cbdaa270040aab8145b3f1d67',
-            'uploader': 'Radio 3',
-            'duration': 1800,
-            'uploader_id': 'bbc_radio_three',
-        },
+            'title': 'Are our coastlines being washed away?',
+            'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
+            'timestamp': 1713556800,
+            'upload_date': '20240419',
+            'duration': 1588,
+            'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
+            'uploader': 'World Service',
+            'uploader_id': 'bbc_world_service',
+            'series': 'CrowdScience',
+            'chapters': [],
+        }
     }, {  # onion routes
         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
         'only_matching': True,
@@ -1008,8 +1039,7 @@ def _real_extract(self, url):
             webpage, 'group id', default=None)
         if group_id:
             return self.url_result(
-                'https://www.bbc.co.uk/programmes/%s' % group_id,
-                ie=BBCCoUkIE.ie_key())
+                f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
 
         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
         programme_id = self._search_regex(
@@ -1069,83 +1099,133 @@ def _real_extract(self, url):
             }
 
         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
-        # There are several setPayload calls may be present but the video
-        # seems to be always related to the first one
-        morph_payload = self._parse_json(
-            self._search_regex(
-                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
-                webpage, 'morph payload', default='{}'),
-            playlist_id, fatal=False)
+        # Several setPayload calls may be present but the video(s)
+        # should be in one that mentions leadMedia or videoData
+        morph_payload = self._search_json(
+            r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
+            contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
+            default={})
         if morph_payload:
-            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
-            for component in components:
-                if not isinstance(component, dict):
-                    continue
-                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
-                if not lead_media:
-                    continue
-                identifiers = lead_media.get('identifiers')
-                if not identifiers or not isinstance(identifiers, dict):
-                    continue
-                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+            for lead_media in traverse_obj(morph_payload, (
+                    'body', 'components', ..., 'props', 'leadMedia', {dict})):
+                programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
                 if not programme_id:
                     continue
-                title = lead_media.get('title') or self._og_search_title(webpage)
                 formats, subtitles = self._download_media_selector(programme_id)
-                description = lead_media.get('summary')
-                uploader = lead_media.get('masterBrand')
-                uploader_id = lead_media.get('mid')
-                duration = None
-                duration_d = lead_media.get('duration')
-                if isinstance(duration_d, dict):
-                    duration = parse_duration(dict_get(
-                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
                 return {
                     'id': programme_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'uploader': uploader,
-                    'uploader_id': uploader_id,
+                    'title': lead_media.get('title') or self._og_search_title(webpage),
+                    **traverse_obj(lead_media, {
+                        'description': ('summary', {str}),
+                        'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
+                        'uploader': ('masterBrand', {str}),
+                        'uploader_id': ('mid', {str}),
+                    }),
                     'formats': formats,
                     'subtitles': subtitles,
                 }
+            body = self._parse_json(traverse_obj(morph_payload, (
+                'body', 'content', 'article', 'body')), playlist_id, fatal=False)
+            for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
+                if video_data.get('vpid'):
+                    video_id = video_data['vpid']
+                    formats, subtitles = self._download_media_selector(video_id)
+                    entry = {
+                        'id': video_id,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    }
+                else:
+                    video_id = video_data['pid']
+                    entry = self.url_result(
+                        f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
+                        video_id, url_transparent=True)
+                entry.update({
+                    'timestamp': traverse_obj(morph_payload, (
+                        'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601})
+                    ),
+                    **traverse_obj(video_data, {
+                        'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
+                        'title': (('title', 'caption'), {str}, any),
+                        'duration': ('duration', {parse_duration}),
+                    }),
+                })
+                if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
+                    return entry
+                entries.append(entry)
+            if entries:
+                playlist_title = traverse_obj(morph_payload, (
+                    'body', 'content', 'article', 'headline', {str})) or playlist_title
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
 
-        preload_state = self._parse_json(self._search_regex(
-            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
-        if preload_state:
-            current_programme = preload_state.get('programmes', {}).get('current') or {}
-            programme_id = current_programme.get('id')
-            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
-                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
-                formats, subtitles = self._download_media_selector(programme_id)
-                synopses = current_programme.get('synopses') or {}
-                network = current_programme.get('network') or {}
-                duration = int_or_none(
-                    current_programme.get('duration', {}).get('value'))
-                thumbnail = None
-                image_url = current_programme.get('image_url')
-                if image_url:
-                    thumbnail = image_url.replace('{recipe}', 'raw')
+        # various PRELOADED_STATE JSON
+        preload_state = self._search_json(
+            r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
+            'preload state', playlist_id, transform_source=js_to_json, default={})
+        # PRELOADED_STATE with current programmme
+        current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
+        programme_id = traverse_obj(current_programme, ('id', {str}))
+        if programme_id and current_programme.get('type') == 'playable_item':
+            title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
+            formats, subtitles = self._download_media_selector(programme_id)
+            return {
+                'id': programme_id,
+                'title': title,
+                'formats': formats,
+                **traverse_obj(current_programme, {
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                    'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
+                    'duration': ('duration', 'value', {int_or_none}),
+                    'uploader': ('network', 'short_title', {str}),
+                    'uploader_id': ('network', 'id', {str}),
+                    'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
+                    'series': ('titles', 'primary', {str}),
+                }),
+                'subtitles': subtitles,
+                'chapters': traverse_obj(preload_state, (
+                    'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
+                        'title': ('titles', {lambda x: join_nonempty(
+                            'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
+                        'start_time': ('offset', 'start', {float_or_none}),
+                        'end_time': ('offset', 'end', {float_or_none}),
+                    })
+                ),
+            }
+
+        # PWA_PRELOADED_STATE with article video asset
+        asset_id = traverse_obj(preload_state, (
+            'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
+            'assetVideo', 0, {str}, any))
+        if asset_id:
+            video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
+            if video_id:
+                article = traverse_obj(preload_state, (
+                    'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
+
+                def image_url(image_id):
+                    return traverse_obj(preload_state, (
+                        'entities', 'images', image_id, 'url',
+                        {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
+
+                formats, subtitles = self._download_media_selector(video_id)
                 return {
-                    'id': programme_id,
-                    'title': title,
-                    'description': dict_get(synopses, ('long', 'medium', 'short')),
-                    'thumbnail': thumbnail,
-                    'duration': duration,
-                    'uploader': network.get('short_title'),
-                    'uploader_id': network.get('id'),
+                    'id': video_id,
+                    **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
+                        'title': ('title', {str}),
+                        'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
+                        'thumbnail': (0, {image_url}),
+                        'duration': ('duration', {int_or_none}),
+                    })),
                     'formats': formats,
                     'subtitles': subtitles,
-                    'chapters': traverse_obj(preload_state, (
-                        'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
-                            'title': ('titles', {lambda x: join_nonempty(
-                                'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
-                            'start_time': ('offset', 'start', {float_or_none}),
-                            'end_time': ('offset', 'end', {float_or_none}),
-                        })) or None,
+                    'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
                 }
+            else:
+                return self.url_result(
+                    f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
+                    asset_id, playlist_title, display_id=playlist_id,
+                    description=playlist_description)
 
         bbc3_config = self._parse_json(
             self._search_regex(
@@ -1191,6 +1271,28 @@ def _real_extract(self, url):
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)
 
+        def parse_model(model):
+            """Extract single video from model structure"""
+            item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+            if not item_id:
+                return
+            formats, subtitles = self._download_media_selector(item_id)
+            return {
+                'id': item_id,
+                'formats': formats,
+                'subtitles': subtitles,
+                **traverse_obj(model, {
+                    'title': ('title', {str}),
+                    'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
+                    'duration': ('versions', 0, 'duration', {int}),
+                    'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+                })
+            }
+
+        def is_type(*types):
+            return lambda _, v: v['type'] in types
+
         initial_data = self._search_regex(
             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
             'quoted preload state', default=None)
@@ -1202,6 +1304,19 @@ def _real_extract(self, url):
             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
             initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
         if initial_data:
+            for video_data in traverse_obj(initial_data, (
+                    'stores', 'article', 'articleBodyContent', is_type('video'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                entry = parse_model(model)
+                if entry:
+                    entries.append(entry)
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
             def parse_media(media):
                 if not media:
                     return
@@ -1234,27 +1349,90 @@ def parse_media(media):
                         'subtitles': subtitles,
                         'timestamp': item_time,
                         'description': strip_or_none(item_desc),
+                        'duration': int_or_none(item.get('duration')),
                     })
-            for resp in (initial_data.get('data') or {}).values():
-                name = resp.get('name')
+
+            for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
+                name = resp['name']
                 if name == 'media-experience':
                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
                 elif name == 'article':
-                    for block in (try_get(resp,
-                                          (lambda x: x['data']['blocks'],
-                                           lambda x: x['data']['content']['model']['blocks'],),
-                                          list) or []):
-                        if block.get('type') not in ['media', 'video']:
-                            continue
-                        parse_media(block.get('model'))
+                    for block in traverse_obj(resp, (
+                            'data', (None, ('content', 'model')), 'blocks',
+                            is_type('media', 'video'), 'model', {dict})):
+                        parse_media(block)
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)
 
+        # extract from SIMORGH_DATA hydration JSON
+        simorgh_data = self._search_json(
+            r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
+            'simorgh data', playlist_id, default={})
+        if simorgh_data:
+            done = False
+            for video_data in traverse_obj(simorgh_data, (
+                    'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
+                model = traverse_obj(video_data, (
+                    'model', 'blocks', is_type('aresMedia'),
+                    'model', 'blocks', is_type('aresMediaMetadata'),
+                    'model', {dict}, any))
+                if video_data['type'] == 'video':
+                    entry = parse_model(model)
+                else:  # legacyMedia: no duration, subtitles
+                    block_id, entry = traverse_obj(model, ('blockId', {str})), None
+                    media_data = traverse_obj(simorgh_data, (
+                        'pageData', 'promo', 'media',
+                        {lambda x: x if x['id'] == block_id else None}))
+                    formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
+                        'url': ('url', {url_or_none}),
+                        'ext': ('format', {str}),
+                        'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
+                    }))
+                    if formats:
+                        entry = {
+                            'id': block_id,
+                            'display_id': playlist_id,
+                            'formats': formats,
+                            'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
+                            **traverse_obj(model, {
+                                'title': ('title', {str}),
+                                'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                                'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                                'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
+                            }),
+                        }
+                    done = True
+                if entry:
+                    entries.append(entry)
+                if done:
+                    break
+            if entries:
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
         def extract_all(pattern):
             return list(filter(None, map(
                 lambda s: self._parse_json(s, playlist_id, fatal=False),
                 re.findall(pattern, webpage))))
 
+        # US accessed article with single embedded video (e.g.
+        # https://www.bbc.com/news/uk-68546268)
+        next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
+                                 ('props', 'pageProps', 'page'))
+        model = traverse_obj(next_data, (
+            ..., 'contents', is_type('video'),
+            'model', 'blocks', is_type('media'),
+            'model', 'blocks', is_type('mediaMetadata'),
+            'model', {dict}, any))
+        if model and (entry := parse_model(model)):
+            if not entry.get('timestamp'):
+                entry['timestamp'] = traverse_obj(next_data, (
+                    ..., 'contents', is_type('timestamp'), 'model',
+                    'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+            entries.append(entry)
+            return self.playlist_result(
+                entries, playlist_id, playlist_title, playlist_description)
+
         # Multiple video article (e.g.
         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
diff --git a/lib/yt_dlp/extractor/cda.py b/lib/yt_dlp/extractor/cda.py
index 90b4d082e..0a5a524c1 100644
--- a/lib/yt_dlp/extractor/cda.py
+++ b/lib/yt_dlp/extractor/cda.py
@@ -16,7 +16,6 @@
     merge_dicts,
     multipart_encode,
     parse_duration,
-    random_birthday,
     traverse_obj,
     try_call,
     try_get,
@@ -63,38 +62,57 @@ class CDAIE(InfoExtractor):
             'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'crash404',
-            'view_count': int,
             'average_rating': float,
             'duration': 137,
             'age_limit': 0,
+            'upload_date': '20160220',
+            'timestamp': 1455968218,
         }
     }, {
-        # Age-restricted
-        'url': 'http://www.cda.pl/video/1273454c4',
+        # Age-restricted with vfilm redirection
+        'url': 'https://www.cda.pl/video/8753244c4',
+        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
         'info_dict': {
-            'id': '1273454c4',
+            'id': '8753244c4',
             'ext': 'mp4',
-            'title': 'Bronson (2008) napisy HD 1080p',
-            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?',
+            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
             'height': 1080,
-            'uploader': 'boniek61',
+            'uploader': 'arhn eu',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 5554,
+            'duration': 991,
             'age_limit': 18,
-            'view_count': int,
             'average_rating': float,
-        },
+            'timestamp': 1633888264,
+            'upload_date': '20211010',
+        }
+    }, {
+        # Age-restricted without vfilm redirection
+        'url': 'https://www.cda.pl/video/17028157b8',
+        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
+        'info_dict': {
+            'id': '17028157b8',
+            'ext': 'mp4',
+            'title': 'STENDUPY MICHAŁ OGIŃSKI',
+            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
+            'height': 480,
+            'uploader': 'oginski',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 18855,
+            'age_limit': 18,
+            'average_rating': float,
+            'timestamp': 1699705901,
+            'upload_date': '20231111',
+        }
     }, {
         'url': 'http://ebd.cda.pl/0x0/5749950c',
         'only_matching': True,
     }]
 
     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
-        form_data = random_birthday('rok', 'miesiac', 'dzien')
-        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
-        data, content_type = multipart_encode(form_data)
+        data, content_type = multipart_encode({'age_confirm': ''})
         return self._download_webpage(
-            urljoin(url, '/a/validatebirth'), video_id, *args,
+            url, video_id, *args,
             data=data, headers={
                 'Referer': url,
                 'Content-Type': content_type,
@@ -164,7 +182,7 @@ def _real_extract(self, url):
         if 'Authorization' in self._API_HEADERS:
             return self._api_extract(video_id)
         else:
-            return self._web_extract(video_id, url)
+            return self._web_extract(video_id)
 
     def _api_extract(self, video_id):
         meta = self._download_json(
@@ -197,9 +215,9 @@ def _api_extract(self, video_id):
             'view_count': meta.get('views'),
         }
 
-    def _web_extract(self, video_id, url):
+    def _web_extract(self, video_id):
         self._set_cookie('cda.pl', 'cda.player', 'html5')
-        webpage = self._download_webpage(
+        webpage, urlh = self._download_webpage_handle(
             f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
@@ -209,10 +227,10 @@ def _web_extract(self, video_id):
             self.raise_geo_restricted()
 
         need_confirm_age = False
-        if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
+        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                    webpage, 'birthday validate form', default=None):
             webpage = self._download_age_confirm_page(
-                url, video_id, note='Confirming age')
+                urlh.url, video_id, note='Confirming age')
             need_confirm_age = True
 
         formats = []
@@ -222,9 +240,6 @@ def _web_extract(self, video_id):
             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)
             ''', webpage, 'uploader', default=None, group='uploader')
-        view_count = self._search_regex(
-            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
-            'view_count', default=None)
         average_rating = self._search_regex(
             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
@@ -235,7 +250,6 @@ def _web_extract(self, video_id):
             'title': self._og_search_title(webpage),
             'description': self._og_search_description(webpage),
             'uploader': uploader,
-            'view_count': int_or_none(view_count),
             'average_rating': float_or_none(average_rating),
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
diff --git a/lib/yt_dlp/extractor/common.py b/lib/yt_dlp/extractor/common.py
index bebbc6b43..e232aa883 100644
--- a/lib/yt_dlp/extractor/common.py
+++ b/lib/yt_dlp/extractor/common.py
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+                                             encoding=encoding, data=data)
         return (content, urlh)
 
     @staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                 expected=True)
 
-    def _request_dump_filename(self, url, video_id):
-        basen = f'{video_id}_{url}'
+    def _request_dump_filename(self, url, video_id, data=None):
+        if data is not None:
+            data = hashlib.md5(data).hexdigest()
+        basen = join_nonempty(video_id, data, url, delim='_')
         trim_length = self.get_param('trim_file_name') or 240
         if len(basen) > trim_length:
             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
         except LookupError:
             return webpage_bytes.decode('utf-8', 'replace')
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                              prefix=None, encoding=None, data=None):
         webpage_bytes = urlh.read()
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
+        url_or_request = self._create_request(url_or_request, data)
         if self.get_param('dump_intermediate_pages', False):
             self.to_screen('Dumping request to ' + urlh.url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.url, video_id)
+            filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
@@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                              impersonate=None, require_impersonation=False):
             if self.get_param('load_pages'):
                 url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                 self.to_screen(f'Loading request from {filename}')
                 try:
                     with open(filename, 'rb') as dumpf:
diff --git a/lib/yt_dlp/extractor/tiktok.py b/lib/yt_dlp/extractor/tiktok.py
index 3d965dd45..2fb41ba79 100644
--- a/lib/yt_dlp/extractor/tiktok.py
+++ b/lib/yt_dlp/extractor/tiktok.py
@@ -45,19 +45,18 @@ class TikTokBaseIE(InfoExtractor):
         # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
         'aid': '0',
     }
-    _KNOWN_APP_INFO = [
-        '7351144126450059040',
-        '7351149742343391009',
-        '7351153174894626592',
-    ]
     _APP_INFO_POOL = None
     _APP_INFO = None
    _APP_USER_AGENT = None
 
+    @property
+    def _KNOWN_APP_INFO(self):
+        return self._configuration_arg('app_info', ie_key=TikTokIE)
+
     @property
     def _API_HOSTNAME(self):
         return self._configuration_arg(
-            'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0]
+            'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
 
     def _get_next_app_info(self):
         if self._APP_INFO_POOL is None:
@@ -66,13 +65,10 @@ def _get_next_app_info(self):
                 for key, default in self._APP_INFO_DEFAULTS.items()
                 if key != 'iid'
             }
-            app_info_list = (
-                self._configuration_arg('app_info', ie_key=TikTokIE)
-                or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO)))
             self._APP_INFO_POOL = [
                 {**defaults, **dict(
                     (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
-                )} for app_info in app_info_list
+                )} for app_info in self._KNOWN_APP_INFO
             ]
 
         if not self._APP_INFO_POOL:
@@ -757,11 +753,13 @@ class TikTokIE(TikTokBaseIE):
 
     def _real_extract(self, url):
         video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
-        try:
-            return self._extract_aweme_app(video_id)
-        except ExtractorError as e:
-            e.expected = True
-            self.report_warning(f'{e}; trying with webpage')
+
+        if self._KNOWN_APP_INFO:
+            try:
+                return self._extract_aweme_app(video_id)
+            except ExtractorError as e:
+                e.expected = True
+                self.report_warning(f'{e}; trying with webpage')
 
         url = self._create_url(user_id, video_id)
         webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})
diff --git a/lib/yt_dlp/extractor/twitter.py b/lib/yt_dlp/extractor/twitter.py
index ecc865655..df7f816bd 100644
--- a/lib/yt_dlp/extractor/twitter.py
+++ b/lib/yt_dlp/extractor/twitter.py
@@ -36,7 +36,7 @@ class TwitterBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'twitter'
     _API_BASE = 'https://api.twitter.com/1.1/'
     _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
-    _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
+    _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
     _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
     _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
     _flow_token = None
@@ -1191,6 +1191,31 @@ class TwitterIE(TwitterBaseIE):
             'age_limit': 0,
             '_old_archive_ids': ['twitter 1724884212803834154'],
         },
+    }, {
+        # x.com
+        'url': 'https://x.com/historyinmemes/status/1790637656616943991',
+        'md5': 'daca3952ba0defe2cfafb1276d4c1ea5',
+        'info_dict': {
+            'id': '1790637589910654976',
+            'ext': 'mp4',
+            'title': 'Historic Vids - One of the most intense moments in history',
+            'description': 'One of the most intense moments in history https://t.co/Zgzhvix8ES',
+            'display_id': '1790637656616943991',
+            'uploader': 'Historic Vids',
+            'uploader_id': 'historyinmemes',
+            'uploader_url': 'https://twitter.com/historyinmemes',
+            'channel_id': '855481986290524160',
+            'upload_date': '20240515',
+            'timestamp': 1715756260.0,
+            'duration': 15.488,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
+            'age_limit': 0,
+            '_old_archive_ids': ['twitter 1790637656616943991'],
+        }
     }, {
         # onion route
         'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
diff --git a/lib/yt_dlp/extractor/youtube.py b/lib/yt_dlp/extractor/youtube.py
index a5fe179c2..e676c5cde 100644
--- a/lib/yt_dlp/extractor/youtube.py
+++ b/lib/yt_dlp/extractor/youtube.py
@@ -2353,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'format': '17',  # 3gp format available on android
                 'extractor_args': {'youtube': {'player_client': ['android']}},
             },
+            'skip': 'android client broken',
         },
         {
             # Skip download of additional client configs (remix client config in this case)
@@ -2730,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'heatmap': 'count:100',
             },
             'params': {
-                'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+                'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}},
             },
         },
     ]
@@ -3317,7 +3318,36 @@ def _extract_heatmap(self, data):
             'value': ('intensityScoreNormalized', {float_or_none}),
         })) or None
 
-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, entities, parent=None):
+        comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict}))
+        if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))):
+            return
+
+        toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict}))
+        time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        return {
+            'id': comment_id,
+            'parent': parent or 'root',
+            **traverse_obj(comment_entity_payload, {
+                'text': ('properties', 'content', 'content', {str}),
+                'like_count': ('toolbar', 'likeCountA11y', {parse_count}),
+                'author_id': ('author', 'channelId', {self.ucid_or_none}),
+                'author': ('author', 'displayName', {str}),
+                'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}),
+                'author_is_uploader': ('author', 'isCreator', {bool}),
+                'author_is_verified': ('author', 'isVerified', {bool}),
+                'author_url': ('author', 'channelCommand', 'innertubeCommand', (
+                    ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url')
+                ), {lambda x: urljoin('https://www.youtube.com', x)}),
+            }, get_all=False),
+            'is_favorited': (None if toolbar_entity_payload is None else
+                             toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'),
+            '_time_text': time_text,  # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id:
             return
@@ -3398,21 +3428,39 @@ def extract_header(contents):
                 break
             return _continuation
 
-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
             if not parent:
                 tracker['current_page_thread'] = 0
             for content in contents:
                 if not parent and tracker['total_parent_comments'] >= max_parents:
                     yield
                 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})
-                comment = self._extract_comment(comment_renderer, parent)
+
+                # old comment format
+                if not entity_payloads:
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+
+                    comment = self._extract_comment_old(comment_renderer, parent)
+
+                # new comment format
+                else:
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict}))
+                        or traverse_obj(content, ('commentViewModel', {dict})))
+                    comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str}))
+                    if not comment_keys:
+                        continue
+                    entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys)
+                    comment = self._extract_comment(entities, parent)
+                    if comment:
+                        comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None
+
                 if not comment:
                     continue
                 comment_id = comment['id']
+
                 if comment.get('is_pinned'):
                     tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.
@@ -3505,7 +3553,7 @@ def extract_thread(contents):
                 check_get_keys = None
                 if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                     check_get_keys = [[*continuation_items_path, ..., (
-                        'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                        'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
                 try:
                     response = self._extract_response(
                         item_id=None, query=continuation,
@@ -3529,6 +3577,7 @@ def extract_thread(contents):
                     raise
                 is_forced_continuation = False
                 continuation = None
+                mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict}))
                 for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                     if is_first_continuation:
                         continuation = extract_header(continuation_items)
@@ -3537,7 +3586,7 @@ def extract_thread(contents):
                         break
                     continue
 
-                for entry in extract_thread(continuation_items):
+                for entry in extract_thread(continuation_items, mutations):
                     if not entry:
                         return
                     yield entry
@@ -3614,8 +3663,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
         yt_query = {
             'videoId': video_id,
         }
-        if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
-            yt_query['params'] = 'CgIIAQ=='
 
         pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
         if pp_arg:
@@ -3631,19 +3678,24 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
 
     def _get_requested_clients(self, url, smuggled_data):
         requested_clients = []
-        default = ['ios', 'android', 'web']
+        android_clients = []
+        default = ['ios', 'web']
         allowed_clients = sorted(
             (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
             key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
         for client in self._configuration_arg('player_client'):
-            if client in allowed_clients:
-                requested_clients.append(client)
-            elif client == 'default':
+            if client == 'default':
                 requested_clients.extend(default)
             elif client == 'all':
                 requested_clients.extend(allowed_clients)
-            else:
+            elif client not in allowed_clients:
                 self.report_warning(f'Skipping unsupported client {client}')
+            elif client.startswith('android'):
+                android_clients.append(client)
+            else:
+                requested_clients.append(client)
+        # Force deprioritization of broken Android clients for format de-duplication
+        requested_clients.extend(android_clients)
         if not requested_clients:
             requested_clients = default
@@ -3862,6 +3914,14 @@ def build_fragments(f):
                         f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
 
                 client_name = fmt.get(STREAMING_DATA_CLIENT_NAME)
+                # Android client formats are broken due to integrity check enforcement
+                # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554
+                is_broken = client_name and client_name.startswith(short_client_name('android'))
+                if is_broken:
+                    self.report_warning(
+                        f'{video_id}: Android client formats are broken and may yield HTTP Error 403. '
+                        'They will be deprioritized', only_once=True)
+
                 name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or ''
                 fps = int_or_none(fmt.get('fps')) or 0
                 dct = {
@@ -3874,7 +3934,7 @@ def build_fragments(f):
                         name, fmt.get('isDrc') and 'DRC',
                         try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                         try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                        throttled and 'THROTTLED', is_damaged and 'DAMAGED',
+                        throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                         (self.get_param('verbose') or all_formats) and client_name,
                         delim=', '),
                     # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
@@ -3892,8 +3952,8 @@ def build_fragments(f):
                     'language': join_nonempty(audio_track.get('id', '').split('.')[0],
                                               'desc' if language_preference < -1 else '') or None,
                     'language_preference': language_preference,
-                    # Strictly de-prioritize damaged and 3gp formats
-                    'preference': -10 if is_damaged else -2 if itag == '17' else None,
+                    # Strictly de-prioritize broken, damaged and 3gp formats
+                    'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
                 }
                 mime_mobj = re.match(
                     r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '')
diff --git a/lib/yt_dlp_version b/lib/yt_dlp_version
index 23cd76177..1d2f1da02 100644
--- a/lib/yt_dlp_version
+++ b/lib/yt_dlp_version
@@ -1 +1 @@
-351dc0bc334c4e1b5f00c152818c3ec0ed71f788
\ No newline at end of file
+12d8ea8246fa901de302ff5cc748caddadc82f41
\ No newline at end of file